blob: 735405681ff8c8b445b291e1b5a4fc9fc6cebbb5 [file] [log] [blame]
Eli Bendersky84fae782013-03-09 07:12:48 -08001"""Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
73__all__ = [
74 # public symbols
75 "Comment",
76 "dump",
77 "Element", "ElementTree",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000078 "fromstring", "fromstringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000079 "iselement", "iterparse",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000080 "parse", "ParseError",
Armin Rigo9ed73062005-12-14 18:10:45 +000081 "PI", "ProcessingInstruction",
82 "QName",
83 "SubElement",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000084 "tostring", "tostringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000085 "TreeBuilder",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000086 "VERSION",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010087 "XML", "XMLID",
Martin Panterdcfebb32016-04-01 06:55:55 +000088 "XMLParser", "XMLPullParser",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010089 "register_namespace",
Armin Rigo9ed73062005-12-14 18:10:45 +000090 ]
91
Florent Xiclunaf15351d2010-03-13 23:24:31 +000092VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
Serhiy Storchaka9ec5e252015-12-07 02:31:11 +020098import collections
Eli Bendersky00f402b2012-07-15 06:02:22 +030099import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000100
Eli Bendersky27cbb192012-06-15 09:03:19 +0300101from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
Armin Rigo9ed73062005-12-14 18:10:45 +0000103
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, the code that raises a ParseError
    attaches two extra attributes to the instance:
        'code' - the specific exception code
        'position' - the line and column of the error

    """
114
115# --------------------------------------------------------------------
116
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000117
def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object exposing a 'tag' attribute counts.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000121
Armin Rigo9ed73062005-12-14 18:10:45 +0000122
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement. This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); it is copied, so the
        caller's dictionary -- and the shared default -- is never mutated.
        Keyword arguments in *extra* are merged into the copy.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions.  "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # Note: unlike append/extend/insert, no element-type validation is
        # performed here.
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is an iterable with zero or more elements.  It may be a
        one-shot iterator such as a generator.

        Raises TypeError if any item is not an Element; in that case no
        item is appended.

        """
        # Materialize once: validating by iterating *elements* and then
        # passing the same object to list.extend() would consume a one-shot
        # iterator during validation and append nothing.
        new_children = list(elements)
        for element in new_children:
            self._assert_is_element(element)
        self._children.extend(new_children)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        Raises TypeError if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents.  To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.  Emits a
        DeprecationWarning on every call.

        """
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get, but some implementations may handle this a
        bit more efficiently.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value, but some implementations may handle
        this a bit more efficiently.  *key* is what attribute to set, and
        *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Names are returned in an arbitrary order, just like an ordinary
        Python dict.  Equivalent to attrib.keys()

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        The attributes are returned in arbitrary order.  Equivalent to
        attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions.  "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
Armin Rigo9ed73062005-12-14 18:10:45 +0000440
Armin Rigo9ed73062005-12-14 18:10:45 +0000441
def SubElement(parent, tag, attrib={}, **extra):
    """Create a child element and append it to an existing parent.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    Return the newly created subelement.

    """
    # Work on a copy so neither the caller's mapping nor the shared default
    # is modified.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
459
Armin Rigo9ed73062005-12-14 18:10:45 +0000460
def Comment(text=None):
    """Comment element factory.

    Build a special element, tagged with this factory function itself,
    which the standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
473
Armin Rigo9ed73062005-12-14 18:10:45 +0000474
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element, tagged with this factory function itself,
    which the standard serializer writes out as an XML processing
    instruction.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    """
    node = Element(ProcessingInstruction)
    node.text = target
    if text:
        # Contents follow the target, separated by a single space.
        node.text = node.text + " " + text
    return node


PI = ProcessingInstruction
492
Armin Rigo9ed73062005-12-14 18:10:45 +0000493
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that it gets proper namespace
    handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances compare, order and hash by their 'text' attribute; the other
    operand may be a QName or a plain value.

    """

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs
Armin Rigo9ed73062005-12-14 18:10:45 +0000538
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000539# --------------------------------------------------------------------
540
Armin Rigo9ed73062005-12-14 18:10:45 +0000541
542class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800543 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000544
Eli Bendersky84fae782013-03-09 07:12:48 -0800545 This class also provides support for serialization to and from
546 standard XML.
547
548 *element* is an optional root element node,
549 *file* is an optional file handle or file name of an XML file whose
550 contents will be used to initialize the tree with.
551
552 """
Armin Rigo9ed73062005-12-14 18:10:45 +0000553 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000554 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000555 self._root = element # first node
556 if file:
557 self.parse(file)
558
Armin Rigo9ed73062005-12-14 18:10:45 +0000559 def getroot(self):
Eli Bendersky84fae782013-03-09 07:12:48 -0800560 """Return root element of this tree."""
Armin Rigo9ed73062005-12-14 18:10:45 +0000561 return self._root
562
Armin Rigo9ed73062005-12-14 18:10:45 +0000563 def _setroot(self, element):
Eli Bendersky84fae782013-03-09 07:12:48 -0800564 """Replace root element of this tree.
565
566 This will discard the current contents of the tree and replace it
567 with the given element. Use with care!
568
569 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000570 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000571 self._root = element
572
Armin Rigo9ed73062005-12-14 18:10:45 +0000573 def parse(self, source, parser=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800574 """Load external XML document into element tree.
575
576 *source* is a file name or file object, *parser* is an optional parser
577 instance that defaults to XMLParser.
578
579 ParseError is raised if the parser fails to parse the document.
580
581 Returns the root element of the given source document.
582
583 """
Antoine Pitroue033e062010-10-29 10:38:18 +0000584 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000585 if not hasattr(source, "read"):
586 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000587 close_source = True
588 try:
Eli Benderskya3699232013-05-19 18:47:23 -0700589 if parser is None:
590 # If no parser was specified, create a default XMLParser
591 parser = XMLParser()
592 if hasattr(parser, '_parse_whole'):
593 # The default XMLParser, when it comes from an accelerator,
594 # can define an internal _parse_whole API for efficiency.
595 # It can be used to parse the whole source without feeding
596 # it with chunks.
597 self._root = parser._parse_whole(source)
598 return self._root
599 while True:
Antoine Pitroue033e062010-10-29 10:38:18 +0000600 data = source.read(65536)
601 if not data:
602 break
603 parser.feed(data)
604 self._root = parser.close()
605 return self._root
606 finally:
607 if close_source:
608 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000609
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000610 def iter(self, tag=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800611 """Create and return tree iterator for the root element.
612
613 The iterator loops over all elements in this tree, in document order.
614
615 *tag* is a string with the tag name to iterate over
616 (default is to return all elements).
617
618 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000619 # assert self._root is not None
620 return self._root.iter(tag)
621
622 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000623 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000624 # Change for a DeprecationWarning in 1.4
625 warnings.warn(
626 "This method will be removed in future versions. "
627 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
628 PendingDeprecationWarning, stacklevel=2
629 )
630 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000631
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000632 def find(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800633 """Find first matching element by tag name or path.
634
635 Same as getroot().find(path), which is Element.find()
636
637 *path* is a string having either an element tag or an XPath,
638 *namespaces* is an optional mapping from namespace prefix to full name.
639
640 Return the first matching element, or None if no element was found.
641
642 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000643 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000644 if path[:1] == "/":
645 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000646 warnings.warn(
647 "This search is broken in 1.3 and earlier, and will be "
648 "fixed in a future version. If you rely on the current "
649 "behaviour, change it to %r" % path,
650 FutureWarning, stacklevel=2
651 )
652 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000653
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000654 def findtext(self, path, default=None, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800655 """Find first matching element by tag name or path.
656
657 Same as getroot().findtext(path), which is Element.findtext()
658
659 *path* is a string having either an element tag or an XPath,
660 *namespaces* is an optional mapping from namespace prefix to full name.
661
662 Return the first matching element, or None if no element was found.
663
664 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000665 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000666 if path[:1] == "/":
667 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000668 warnings.warn(
669 "This search is broken in 1.3 and earlier, and will be "
670 "fixed in a future version. If you rely on the current "
671 "behaviour, change it to %r" % path,
672 FutureWarning, stacklevel=2
673 )
674 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000675
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000676 def findall(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800677 """Find all matching subelements by tag name or path.
678
679 Same as getroot().findall(path), which is Element.findall().
680
681 *path* is a string having either an element tag or an XPath,
682 *namespaces* is an optional mapping from namespace prefix to full name.
683
684 Return list containing all matching elements in document order.
685
686 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000687 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000688 if path[:1] == "/":
689 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000690 warnings.warn(
691 "This search is broken in 1.3 and earlier, and will be "
692 "fixed in a future version. If you rely on the current "
693 "behaviour, change it to %r" % path,
694 FutureWarning, stacklevel=2
695 )
696 return self._root.findall(path, namespaces)
697
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000698 def iterfind(self, path, namespaces=None):
Eli Bendersky84fae782013-03-09 07:12:48 -0800699 """Find all matching subelements by tag name or path.
700
701 Same as getroot().iterfind(path), which is element.iterfind()
702
703 *path* is a string having either an element tag or an XPath,
704 *namespaces* is an optional mapping from namespace prefix to full name.
705
706 Return an iterable yielding all matching elements in document order.
707
708 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000709 # assert self._root is not None
710 if path[:1] == "/":
711 path = "." + path
712 warnings.warn(
713 "This search is broken in 1.3 and earlier, and will be "
714 "fixed in a future version. If you rely on the current "
715 "behaviour, change it to %r" % path,
716 FutureWarning, stacklevel=2
717 )
718 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000719
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000720 def write(self, file_or_filename,
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000721 encoding=None,
722 xml_declaration=None,
723 default_namespace=None,
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800724 method=None, *,
725 short_empty_elements=True):
Eli Bendersky84fae782013-03-09 07:12:48 -0800726 """Write element tree to a file as XML.
727
728 Arguments:
729 *file_or_filename* -- file name or a file object opened for writing
730
731 *encoding* -- the output encoding (default: US-ASCII)
732
733 *xml_declaration* -- bool indicating if an XML declaration should be
734 added to the output. If None, an XML declaration
735 is added if encoding IS NOT either of:
736 US-ASCII, UTF-8, or Unicode
737
738 *default_namespace* -- sets the default XML namespace (for "xmlns")
739
740 *method* -- either "xml" (default), "html, "text", or "c14n"
741
742 *short_empty_elements* -- controls the formatting of elements
743 that contain no content. If True (default)
744 they are emitted as a single self-closed
745 tag, otherwise they are emitted as a pair
746 of start/end tags
Eli Benderskye9af8272013-01-13 06:27:51 -0800747
748 """
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000749 if not method:
750 method = "xml"
751 elif method not in _serialize:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000752 raise ValueError("unknown method %r" % method)
Florent Xiclunac17f1722010-08-08 19:48:29 +0000753 if not encoding:
754 if method == "c14n":
755 encoding = "utf-8"
756 else:
757 encoding = "us-ascii"
Martin Panter89f76d32015-09-23 01:14:35 +0000758 enc_lower = encoding.lower()
759 with _get_writer(file_or_filename, enc_lower) as write:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300760 if method == "xml" and (xml_declaration or
761 (xml_declaration is None and
Martin Panter89f76d32015-09-23 01:14:35 +0000762 enc_lower not in ("utf-8", "us-ascii", "unicode"))):
Eli Bendersky00f402b2012-07-15 06:02:22 +0300763 declared_encoding = encoding
Martin Panter89f76d32015-09-23 01:14:35 +0000764 if enc_lower == "unicode":
Eli Bendersky00f402b2012-07-15 06:02:22 +0300765 # Retrieve the default encoding for the xml declaration
766 import locale
767 declared_encoding = locale.getpreferredencoding()
768 write("<?xml version='1.0' encoding='%s'?>\n" % (
769 declared_encoding,))
770 if method == "text":
771 _serialize_text(write, self._root)
Armin Rigo9ed73062005-12-14 18:10:45 +0000772 else:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300773 qnames, namespaces = _namespaces(self._root, default_namespace)
774 serialize = _serialize[method]
Eli Benderskya9a2ef52013-01-13 06:04:43 -0800775 serialize(write, self._root, qnames, namespaces,
776 short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000777
778 def write_c14n(self, file):
779 # lxml.etree compatibility. use output method instead
780 return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000781
782# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000783# serialization support
784
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by open().  *encoding* is compared against the
    literal "unicode" (ElementTree.write passes a lower-cased name): with
    "unicode" the target is written as text, otherwise the text is encoded
    with *encoding* using the "xmlcharrefreplace" error handler.

    Files opened here are closed on exit; wrappers created around a
    caller-supplied object are detached on exit so that object stays open.

    """
    # returns text write method and release all resources after using
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            # no explicit encoding: open() picks its default text encoding
            file = open(file_or_filename, "w")
        else:
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        # we opened the file, so we are responsible for closing it
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    # already buffered: TextIOWrapper can wrap it directly
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        # target has no seekable()/tell(); leave the
                        # BufferedIOBase defaults in place
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write
836
def _namespaces(elem, default_namespace=None):
    """Identify the namespaces used in the tree rooted at *elem*.

    Walks every element returned by elem.iter() and records the tag, the
    attribute names, and any QName-valued attribute values or text.

    Returns a (qnames, namespaces) tuple: *qnames* maps each universal name
    to its serialized "prefix:local" form (or the bare local name), and
    *namespaces* maps namespace URIs to the prefixes chosen for them.

    Raises ValueError if *default_namespace* is given and an unqualified
    name is encountered; unserializable tags are reported through
    _raise_serialization_error().

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                # plain, non-namespaced name
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # first sighting of this URI: prefer a registered prefix,
                # otherwise invent an "nsN" one
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local # default element
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for attr_name, attr_value in node.items():
            if isinstance(attr_name, QName):
                attr_name = attr_name.text
            if attr_name not in qnames:
                add_qname(attr_name)
            if isinstance(attr_value, QName) and attr_value.text not in qnames:
                add_qname(attr_value.text)
        content = node.text
        if isinstance(content, QName) and content.text not in qnames:
            add_qname(content.text)
    return qnames, namespaces
897
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Serialize *elem* and its subtree as XML through *write*.

    *qnames* maps tags/attribute names to their serialized form,
    *namespaces* maps URIs to prefixes and is only passed for the element
    that should carry the xmlns declarations (children receive None).
    When *short_empty_elements* is true, elements without text or children
    are written as a self-closed tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # tag-less element: emit only its content
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                for uri, prefix in sorted(namespaces.items(),
                                          key=lambda x: x[1]):  # sort on prefix
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            has_content = text or len(elem) or not short_empty_elements
            if not has_content:
                write(" />")
            else:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
947
# Lower-cased tag names for which the HTML serializer writes no end tag.
# Built directly as a set: the old tuple-then-set() fallback guarded against
# interpreters without a set builtin, which cannot run this module anyway.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
955
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Serialize *elem* and its subtree as HTML through *write*.

    Differs from the XML serializer in that start tags are never
    self-closed, the text of "script" and "style" elements is written
    unescaped, and no end tag is written for tags listed in HTML_EMPTY.
    Extra keyword arguments are accepted and ignored.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # tag-less element: emit only its content
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                for uri, prefix in sorted(namespaces.items(),
                                          key=lambda x: x[1]):  # sort on prefix
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    # raw text elements: content must not be escaped
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1005
1006def _serialize_text(write, elem):
1007 for part in elem.itertext():
1008 write(part)
1009 if elem.tail:
1010 write(elem.tail)
1011
# Dispatch table: maps the *method* name accepted by ElementTree.write()
# to the function that performs the serialization.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
Armin Rigo9ed73062005-12-14 18:10:45 +00001019
Armin Rigo9ed73062005-12-14 18:10:45 +00001020
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix is reserved or is invalid.

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the stale entries first so the registry is not mutated while
    # it is being iterated.
    stale_uris = [known_uri
                  for known_uri, known_prefix in _namespace_map.items()
                  if known_uri == uri or known_prefix == prefix]
    for known_uri in stale_uris:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix
1039
# Global registry mapping namespace URI -> preferred prefix.  It is updated
# by register_namespace() and consulted during serialization when a URI has
# not been assigned a prefix yet.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# register_namespace.
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001054
1055def _raise_serialization_error(text):
1056 raise TypeError(
1057 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1058 )
1059
1060def _escape_cdata(text):
1061 # escape character data
1062 try:
1063 # it's worth avoiding do-nothing calls for strings that are
1064 # shorter than 500 character, or so. assume that's, by far,
1065 # the most common case in most applications.
1066 if "&" in text:
1067 text = text.replace("&", "&amp;")
1068 if "<" in text:
1069 text = text.replace("<", "&lt;")
1070 if ">" in text:
1071 text = text.replace(">", "&gt;")
1072 return text
1073 except (TypeError, AttributeError):
1074 _raise_serialization_error(text)
1075
1076def _escape_attrib(text):
1077 # escape attribute value
1078 try:
1079 if "&" in text:
1080 text = text.replace("&", "&amp;")
1081 if "<" in text:
1082 text = text.replace("<", "&lt;")
1083 if ">" in text:
1084 text = text.replace(">", "&gt;")
1085 if "\"" in text:
1086 text = text.replace("\"", "&quot;")
Raymond Hettinger076366c2016-09-11 23:18:03 -07001087 # The following business with carriage returns is to satisfy
Raymond Hettinger11fa3ff2016-09-11 23:23:24 -07001088 # Section 2.11 of the XML specification, stating that
Raymond Hettinger076366c2016-09-11 23:18:03 -07001089 # CR or CR LN should be replaced with just LN
1090 # http://www.w3.org/TR/REC-xml/#sec-line-ends
1091 if "\r\n" in text:
1092 text = text.replace("\r\n", "\n")
1093 if "\r" in text:
1094 text = text.replace("\r", "\n")
1095 #The following four lines are issue 17582
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001096 if "\n" in text:
1097 text = text.replace("\n", "&#10;")
Raymond Hettinger076366c2016-09-11 23:18:03 -07001098 if "\t" in text:
1099 text = text.replace("\t", "&#09;")
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001100 return text
1101 except (TypeError, AttributeError):
1102 _raise_serialization_error(text)
1103
1104def _escape_attrib_html(text):
1105 # escape attribute value
1106 try:
1107 if "&" in text:
1108 text = text.replace("&", "&amp;")
1109 if ">" in text:
1110 text = text.replace(">", "&gt;")
1111 if "\"" in text:
1112 text = text.replace("\"", "&quot;")
1113 return text
1114 except (TypeError, AttributeError):
1115 _raise_serialization_error(text)
1116
1117# --------------------------------------------------------------------
1118
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned. Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n".
    *short_empty_elements* is passed through to ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # Text is collected in a StringIO, encoded output in a BytesIO.
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001137
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001138class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001139 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001140 def __init__(self, lst):
1141 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001142
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001143 def writable(self):
1144 return True
1145
1146 def seekable(self):
1147 return True
1148
1149 def write(self, b):
1150 self.lst.append(b)
1151
1152 def tell(self):
1153 return len(self.lst)
1154
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate string representation of XML element as a list of chunks.

    Takes the same arguments as tostring(); the serialized output is
    returned as the list of pieces that were written, in order, rather
    than as a single joined value.

    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001162
Armin Rigo9ed73062005-12-14 18:10:45 +00001163
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    # Make sure the output ends with a newline.
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1181
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001182# --------------------------------------------------------------------
1183# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001184
Armin Rigo9ed73062005-12-14 18:10:45 +00001185
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    # ElementTree.parse() fills the tree in place.
    document.parse(source, parser)
    return document
1198
Armin Rigo9ed73062005-12-14 18:10:45 +00001199
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  The iterator has a
    *root* attribute which is None until the whole document has been
    parsed and then refers to the root element.  A file opened here from a
    filename is closed when the iteration finishes.

    """
    # collections.Iterator is a deprecated alias that newer Python versions
    # no longer provide; take the ABC from collections.abc instead.
    from collections.abc import Iterator as _IteratorABC

    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator():
        try:
            while True:
                # hand out whatever the previous feed() produced
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            if close_source:
                source.close()

    # A throwaway class whose __next__ is bound to the generator gives the
    # returned object iterator behaviour plus a writable "root" attribute.
    class IterParseIterator(_IteratorABC):
        __next__ = iterator().__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    # The generator body has not started running yet, so it will see the
    # values assigned here.
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True

    return it
Armin Rigo9ed73062005-12-14 18:10:45 +00001246
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001247
class XMLPullParser:
    """Non-blocking parser that queues parse events for the caller to pull.

    *events* is a sequence of event names to report; if omitted, only
    "end" events are reported.  Data is supplied with feed() and the
    resulting events are retrieved with read_events().

    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See http://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        if _parser:
            self._parser = _parser
        else:
            self._parser = XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            # Queue the error so it is raised from read_events(), after the
            # events that preceded it.
            self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.
        """
        queue = self._events_queue
        while queue:
            item = queue.popleft()
            if isinstance(item, Exception):
                raise item
            yield item
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001299
1300
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    # Fall back to a standard parser when none (or a false value) is given.
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1316
Armin Rigo9ed73062005-12-14 18:10:45 +00001317
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    # Fall back to a standard parser when none (or a false value) is given.
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    tree = active_parser.close()
    ids = {}
    for node in tree.iter():
        node_id = node.get("id")
        # elements without an id (or with an empty one) are not indexed
        if node_id:
            ids[node_id] = node
    return tree, ids
1338
# Parse XML document from string constant.  Alias for XML(): same
# arguments, same return value.
fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001341
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    # Fall back to a standard parser when none (or a false value) is given.
    active_parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
1356
1357# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001358
Armin Rigo9ed73062005-12-14 18:10:45 +00001359
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    """
    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected character data to the most recent element:
        # as its tail when an end tag was seen last, as its text otherwise.
        if not self._data:
            return
        target = self._last
        if target is not None:
            text = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = text
            else:
                assert target.text is None, "internal error (text)"
                target.text = text
        # data seen before any element is dropped
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        stack = self._elem
        if stack:
            # nest the new element under the currently open one
            stack[-1].append(elem)
        stack.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1432
Armin Rigo9ed73062005-12-14 18:10:45 +00001433
Eli Bendersky84fae782013-03-09 07:12:48 -08001434# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001435class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001436 """Element structure builder for XML source data based on the expat parser.
1437
Martin Panter29ce0822016-06-04 07:12:51 +00001438 *html* are predefined HTML entities (deprecated and not supported),
Eli Bendersky84fae782013-03-09 07:12:48 -08001439 *target* is an optional target object which defaults to an instance of the
1440 standard TreeBuilder class, *encoding* is an optional encoding string
1441 which if given, overrides the encoding specified in the XML file:
1442 http://www.iana.org/assignments/character-sets
1443
1444 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001445
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001446 def __init__(self, html=0, target=None, encoding=None):
Armin Rigo9ed73062005-12-14 18:10:45 +00001447 try:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001448 from xml.parsers import expat
Brett Cannoncd171c82013-07-04 17:43:24 -04001449 except ImportError:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001450 try:
1451 import pyexpat as expat
Brett Cannoncd171c82013-07-04 17:43:24 -04001452 except ImportError:
1453 raise ImportError(
1454 "No module named expat; use SimpleXMLTreeBuilder instead"
1455 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001456 parser = expat.ParserCreate(encoding, "}")
Armin Rigo9ed73062005-12-14 18:10:45 +00001457 if target is None:
1458 target = TreeBuilder()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001459 # underscored names are provided for compatibility only
1460 self.parser = self._parser = parser
1461 self.target = self._target = target
1462 self._error = expat.error
Armin Rigo9ed73062005-12-14 18:10:45 +00001463 self._names = {} # name memo cache
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001464 # main callbacks
Armin Rigo9ed73062005-12-14 18:10:45 +00001465 parser.DefaultHandlerExpand = self._default
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001466 if hasattr(target, 'start'):
1467 parser.StartElementHandler = self._start
1468 if hasattr(target, 'end'):
1469 parser.EndElementHandler = self._end
1470 if hasattr(target, 'data'):
1471 parser.CharacterDataHandler = target.data
1472 # miscellaneous callbacks
1473 if hasattr(target, 'comment'):
1474 parser.CommentHandler = target.comment
1475 if hasattr(target, 'pi'):
1476 parser.ProcessingInstructionHandler = target.pi
Eli Bendersky6206a7e2013-08-25 18:58:18 -07001477 # Configure pyexpat: buffering, new-style attribute handling.
1478 parser.buffer_text = 1
1479 parser.ordered_attributes = 1
1480 parser.specified_attributes = 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001481 self._doctype = None
1482 self.entity = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001483 try:
1484 self.version = "Expat %d.%d.%d" % expat.version_info
1485 except AttributeError:
1486 pass # unknown
1487
def _setevents(self, events_queue, events_to_report):
    """Install expat handlers that record parse events (XMLPullParser API).

    *events_to_report* is an iterable of event names to report during
    parsing (the same values as the *events* argument of XMLPullParser's
    constructor): "start", "end", "start-ns" and "end-ns".

    *events_queue* is a list; every installed handler appends an
    ``(event_name, payload)`` tuple to it while the underlying parser runs.

    Raises ValueError for any other event name.
    """
    expat_parser = self._parser
    push = events_queue.append
    for event_name in events_to_report:
        if event_name == "start":
            # _start expects attributes as a flat [name, value, ...] list.
            expat_parser.ordered_attributes = 1
            expat_parser.specified_attributes = 1

            # Defaults bind the current loop values to this handler.
            def on_start(tag, attrib_in, event=event_name, push=push,
                         start=self._start):
                push((event, start(tag, attrib_in)))

            expat_parser.StartElementHandler = on_start
        elif event_name == "end":
            def on_end(tag, event=event_name, push=push, end=self._end):
                push((event, end(tag)))

            expat_parser.EndElementHandler = on_end
        elif event_name == "start-ns":
            def on_start_ns(prefix, uri, event=event_name, push=push):
                # Missing prefix/uri are reported as empty strings.
                push((event, (prefix or "", uri or "")))

            expat_parser.StartNamespaceDeclHandler = on_start_ns
        elif event_name == "end-ns":
            def on_end_ns(prefix, event=event_name, push=push):
                push((event, None))

            expat_parser.EndNamespaceDeclHandler = on_end_ns
        else:
            raise ValueError("unknown event %r" % event_name)
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001520
def _raiseerror(self, value):
    """Re-raise the expat error *value* as a ParseError.

    The new exception wraps *value* and copies its ``code``; its
    ``position`` is the ``(lineno, offset)`` pair taken from *value*.
    """
    parse_error = ParseError(value)
    parse_error.code = value.code
    parse_error.position = (value.lineno, value.offset)
    raise parse_error
Armin Rigo9ed73062005-12-14 18:10:45 +00001526
def _fixname(self, key):
    """Return the expanded form of the expat name *key*.

    A name containing "}" gets a leading "{" prepended; any other name
    is returned unchanged.  Results are memoized in ``self._names``.
    """
    memo = self._names
    if key in memo:
        return memo[key]
    expanded = "{" + key if "}" in key else key
    memo[key] = expanded
    return expanded
1537
def _start(self, tag, attr_list):
    """Handler for expat's StartElementHandler.

    Because ordered_attributes is set on the parser, *attr_list* is a flat
    list of alternating attribute name, value.  Tag and attribute names are
    passed through ``_fixname``; the result of ``target.start(tag, attrib)``
    is returned.
    """
    resolve = self._fixname
    element_tag = resolve(tag)
    if attr_list:
        attributes = {
            resolve(attr_list[pos]): attr_list[pos + 1]
            for pos in range(0, len(attr_list), 2)
        }
    else:
        attributes = {}
    return self.target.start(element_tag, attributes)
Armin Rigo9ed73062005-12-14 18:10:45 +00001549
def _end(self, tag):
    """Handler for expat's EndElementHandler.

    Passes the ``_fixname``-expanded *tag* to ``target.end`` and returns
    whatever that call returns.
    """
    on_end = self.target.end
    return on_end(self._fixname(tag))
1552
def _default(self, text):
    """Handle text that expat passes to the default handler.

    Installed as ``parser.DefaultHandlerExpand``.  Three cases are handled:

    * *text* starts with "&": it is an entity reference.  The name between
      "&" and ";" is looked up in ``self.entity`` and the replacement is
      passed to ``target.data``.  If the target has no ``data`` method the
      reference is ignored; if the entity is not defined, an expat error
      (``self._error``) with ``code`` 11 and the parser's current
      line/column is raised.
    * *text* starts with "<!DOCTYPE": start collecting doctype tokens.
    * a doctype is being collected: tokens are accumulated until a PUBLIC
      (4 tokens) or SYSTEM (3 tokens) declaration is complete, which is then
      reported to ``target.doctype`` or, failing that, to an overridden
      ``self.doctype``.  A ">" ends collection.
    """
    prefix = text[:1]
    if prefix == "&":
        # deal with undefined entities
        try:
            data_handler = self.target.data
        except AttributeError:
            return
        # Only the lookup is guarded: a KeyError raised by the target's own
        # data() must propagate as-is rather than be reported as an
        # undefined entity.
        try:
            replacement = self.entity[text[1:-1]]
        except KeyError:
            parser = self.parser
            # Use the same error class that feed()/close() catch.
            err = self._error(
                "undefined entity %s: line %d, column %d" %
                (text, parser.ErrorLineNumber, parser.ErrorColumnNumber)
            )
            err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
            err.lineno = parser.ErrorLineNumber
            err.offset = parser.ErrorColumnNumber
            raise err
        data_handler(replacement)
    elif prefix == "<" and text[:9] == "<!DOCTYPE":
        self._doctype = []  # inside a doctype declaration
    elif self._doctype is not None:
        # parse doctype contents
        if prefix == ">":
            self._doctype = None
            return
        text = text.strip()
        if not text:
            return
        self._doctype.append(text)
        n = len(self._doctype)
        if n > 2:
            kind = self._doctype[1]
            if kind == "PUBLIC" and n == 4:
                name, _, pubid, system = self._doctype
                if pubid:
                    pubid = pubid[1:-1]  # strip the surrounding quotes
            elif kind == "SYSTEM" and n == 3:
                name, _, system = self._doctype
                pubid = None
            else:
                # declaration not complete yet (or not recognised)
                return
            if hasattr(self.target, "doctype"):
                self.target.doctype(name, pubid, system[1:-1])
            elif self.doctype != self._XMLParser__doctype:
                # doctype() was overridden in a subclass:
                # warn about deprecated call, then invoke the override
                self._XMLParser__doctype(name, pubid, system[1:-1])
                self.doctype(name, pubid, system[1:-1])
            self._doctype = None
1604
def doctype(self, name, pubid, system):
    """(Deprecated) Handle doctype declaration.

    *name* is the Doctype name, *pubid* is the public identifier,
    and *system* is the system identifier.

    This implementation ignores its arguments and only emits a
    DeprecationWarning.
    """
    message = (
        "This method of XMLParser is deprecated. Define doctype() "
        "method on the TreeBuilder target."
    )
    warnings.warn(message, DeprecationWarning)

# sentinel used to detect whether doctype is redefined in a subclass
__doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001620
def feed(self, data):
    """Feed encoded data to parser.

    *data* is handed to the expat parser as a non-final chunk; an expat
    error is converted through ``_raiseerror``.
    """
    try:
        self.parser.Parse(data, 0)
    except self._error as expat_exc:
        self._raiseerror(expat_exc)
Armin Rigo9ed73062005-12-14 18:10:45 +00001627
def close(self):
    """Finish feeding data to parser and return element structure.

    Signals end of data to the expat parser (errors are converted through
    ``_raiseerror``), then returns the result of ``target.close()`` if the
    target defines it, otherwise None.  The parser and target references
    are dropped once the target lookup/call has been attempted.
    """
    try:
        self.parser.Parse("", 1)  # end of data
    except self._error as expat_exc:
        self._raiseerror(expat_exc)
    try:
        try:
            finish = self.target.close
        except AttributeError:
            return None
        return finish()
    finally:
        # get rid of circular references
        del self.parser, self._parser
        del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001644
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001645
# Element is going to be shadowed by the C implementation.  Keep the Python
# version of it accessible for some "creative" use by external code
# (see tests).
_Element_Py = Element

# Import the C accelerators
try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    pass