blob: b56ddff64cbdcbfe1c55b3a17e90e229750fa427 [file] [log] [blame]
"""Lightweight XML support for Python.

 XML is an inherently hierarchical data format, and the most natural way to
 represent it is with a tree. This module has two classes for this purpose:

    1. ElementTree represents the whole XML document as a tree and

    2. Element represents a single node in this tree.

 Interactions with the whole document (reading and writing to/from files) are
 usually done on the ElementTree level. Interactions with a single XML element
 and its sub-elements are done on the Element level.

 Element is a flexible container object designed to store hierarchical data
 structures in memory. It can be described as a cross between a list and a
 dictionary. Each Element has a number of properties associated with it:

    'tag' - a string containing the element's name.

    'attributes' - a Python dictionary storing the element's attributes.

    'text' - a string containing the element's text content.

    'tail' - an optional string containing text after the element's end tag.

    And a number of child elements stored in a Python sequence.

 To create an element instance, use the Element constructor,
 or the SubElement factory function.

 You can also use the ElementTree class to wrap an element structure
 and convert it to and from XML.

"""
35
Eli Benderskybf05df22013-04-20 05:44:01 -070036#---------------------------------------------------------------------
37# Licensed to PSF under a Contributor Agreement.
38# See http://www.python.org/psf/license for licensing details.
Armin Rigo9ed73062005-12-14 18:10:45 +000039#
40# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +000041# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000042#
43# fredrik@pythonware.com
44# http://www.pythonware.com
Armin Rigo9ed73062005-12-14 18:10:45 +000045# --------------------------------------------------------------------
46# The ElementTree toolkit is
47#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000048# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000049#
50# By obtaining, using, and/or copying this software and/or its
51# associated documentation, you agree that you have read, understood,
52# and will comply with the following terms and conditions:
53#
54# Permission to use, copy, modify, and distribute this software and
55# its associated documentation for any purpose and without fee is
56# hereby granted, provided that the above copyright notice appears in
57# all copies, and that both that copyright notice and this permission
58# notice appear in supporting documentation, and that the name of
59# Secret Labs AB or the author not be used in advertising or publicity
60# pertaining to distribution of the software without specific, written
61# prior permission.
62#
63# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70# OF THIS SOFTWARE.
71# --------------------------------------------------------------------
72
# Public symbols exported by ``from <module> import *``.
__all__ = [
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLTreeBuilder",
    "register_namespace",
]

# Version string exposed by this module.
VERSION = "1.3.0"
93
Florent Xiclunaf15351d2010-03-13 23:24:31 +000094import sys
95import re
96import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +030097import io
98import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +000099
Eli Bendersky27cbb192012-06-15 09:03:19 +0300100from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000101
Armin Rigo9ed73062005-12-14 18:10:45 +0000102
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the usual exception value, instances are expected to carry
    two additional attributes:

        'code' - the specific exception code
        'position' - the line and column of the error

    The class itself adds nothing to SyntaxError; the attributes are
    attached by whoever raises the exception.

    """
113
114# --------------------------------------------------------------------
115
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000116
def iselement(element):
    """Tell whether *element* looks like an Element.

    This is a duck-typing check: any object exposing a 'tag' attribute
    is accepted.

    """
    looks_like_element = hasattr(element, 'tag')
    return looks_like_element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000120
Armin Rigo9ed73062005-12-14 18:10:45 +0000121
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements. That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name. *attrib* is an optional dictionary containing
    element attributes. *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before first subelement. This is either a string or the value
    # None. Note that if there is no text, this attribute may be either
    # None or the empty string, depending on the parser.
    text = None

    # Text after this element's end tag, but before the next sibling
    # element's start tag. This is either a string or the value None. Note
    # that if there was no text, this attribute may be either None or an
    # empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Work on a copy so that neither the caller's dictionary nor the
        # shared default argument is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy. Subelements will be shared with the
        original tree.

        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of direct subelements."""
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an integer or a slice)."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index*; no type check is done."""
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*."""
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        TypeError is raised if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from an iterable.

        *elements* is any iterable with zero or more elements; one-shot
        iterators and generators are supported.

        TypeError is raised if any item is not an Element, in which case
        nothing is appended.

        """
        # Materialize once: validating and then extending from the same
        # one-shot iterator would exhaust it during validation and silently
        # append nothing.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*.

        TypeError is raised if *subelement* is not an Element.

        """
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        Unlike the find methods, this method compares elements based on
        identity, NOT ON tag value or contents. To remove subelements by
        other means, the easiest way is to use a list comprehension to
        select what elements to keep, and then use slice assignment to update
        the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def getchildren(self):
        """(Deprecated) Return all subelements.

        Elements are returned in document order.

        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found. Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get. *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Returns the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value. *key* is what attribute to set,
        and *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys().

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included. To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated the same as None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
            )
        return list(self.iter(tag))

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.

        """
        tag = self.tag
        # Elements whose tag is neither a string nor None yield no text
        # at all.
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail

# compatibility
_Element = _ElementInterface = Element
Armin Rigo9ed73062005-12-14 18:10:45 +0000453
Armin Rigo9ed73062005-12-14 18:10:45 +0000454
def SubElement(parent, tag, attrib={}, **extra):
    """Create an element and append it to an existing parent.

    The new element is built through parent.makeelement() and then added
    with parent.append().

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    Return the newly created element.

    """
    # Merge into a private copy; the caller's dictionary (and the shared
    # default) must stay untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
472
Armin Rigo9ed73062005-12-14 18:10:45 +0000473
def Comment(text=None):
    """Comment element factory.

    Build a special element whose tag is the Comment function itself; the
    standard serializer is expected to write such an element as an XML
    comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
486
Armin Rigo9ed73062005-12-14 18:10:45 +0000487
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element whose tag is the ProcessingInstruction function
    itself. Its text is *target*, followed by a space and *text* when
    *text* is given and non-empty.

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.

    """
    node = Element(ProcessingInstruction)
    node.text = target + " " + text if text else target
    return node

PI = ProcessingInstruction
505
Armin Rigo9ed73062005-12-14 18:10:45 +0000506
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing are delegated to the wrapped text, so a QName
    compares equal to another QName or a plain value with the same text.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    @staticmethod
    def _operand(other):
        # Unwrap another QName to its text; anything else is compared as-is.
        return other.text if isinstance(other, QName) else other

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<QName %r>' % (self.text,)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= self._operand(other)

    def __lt__(self, other):
        return self.text < self._operand(other)

    def __ge__(self, other):
        return self.text >= self._operand(other)

    def __gt__(self, other):
        return self.text > self._operand(other)

    def __eq__(self, other):
        return self.text == self._operand(other)

    def __ne__(self, other):
        return self.text != self._operand(other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000555
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000556# --------------------------------------------------------------------
557
Armin Rigo9ed73062005-12-14 18:10:45 +0000558
559class ElementTree:
Eli Bendersky84fae782013-03-09 07:12:48 -0800560 """An XML element hierarchy.
Armin Rigo9ed73062005-12-14 18:10:45 +0000561
Eli Bendersky84fae782013-03-09 07:12:48 -0800562 This class also provides support for serialization to and from
563 standard XML.
564
565 *element* is an optional root element node,
566 *file* is an optional file handle or file name of an XML file whose
567 contents will be used to initialize the tree with.
568
569 """
def __init__(self, element=None, file=None):
    """Store *element* as the root and optionally load *file*.

    When *file* is truthy it is handed to self.parse(), which replaces
    the root with the parsed document.

    """
    self._root = element # first node
    if not file:
        return
    self.parse(file)
575
def getroot(self):
    """Return the element currently stored as the root of this tree."""
    root = self._root
    return root
579
def _setroot(self, element):
    """Install *element* as the new root of this tree.

    The previous root is simply dropped; no validation of *element*
    is performed. Use with care!

    """
    self._root = element
589
def parse(self, source, parser=None):
    """Load external XML document into element tree.

    *source* is a file name or file object (anything with a read()
    method), *parser* is an optional parser instance that defaults to
    XMLParser.

    The data is fed to the parser in chunks; whatever the parser's
    close() returns becomes the new root and is returned. A file opened
    here from a file name is always closed; a caller-supplied file
    object is left open.

    ParseError is raised if the parser fails to parse the document.

    Returns the root element of the given source document.

    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        # Identity test, not truthiness: a caller-supplied parser that
        # happens to evaluate as false (e.g. it defines __len__) must
        # not be silently replaced by the default one.
        if parser is None:
            parser = XMLParser(target=TreeBuilder())
        while True:
            data = source.read(65536)
            if not data:
                break
            parser.feed(data)
        self._root = parser.close()
        return self._root
    finally:
        if close_source:
            source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000618
def iter(self, tag=None):
    """Create and return tree iterator for the root element.

    This simply delegates to the root element's iter() method.

    *tag* is a string with the tag name to iterate over
    (default is to return all elements).

    """
    root = self._root
    return root.iter(tag)
630
631 # compatibility
def getiterator(self, tag=None):
    """Compatibility wrapper: warn, then return list(self.iter(tag))."""
    # Change for a DeprecationWarning in 1.4
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    matches = list(self.iter(tag))
    return matches
Armin Rigo9ed73062005-12-14 18:10:45 +0000640
def find(self, path, namespaces=None):
    """Find first matching element by tag name or path.

    Delegates to the root element's find() method.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's find() returns.

    """
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    root = self._root
    return root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000662
def findtext(self, path, default=None, namespaces=None):
    """Find text for first matching element by tag name or path.

    Delegates to the root element's findtext() method.

    *path* is a string having either an element tag or an XPath,
    *default* is passed through to the root element's findtext(),
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's findtext() returns.

    """
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    root = self._root
    return root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000684
def findall(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the root element's findall() method.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's findall() returns.

    """
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    root = self._root
    return root.findall(path, namespaces)
706
def iterfind(self, path, namespaces=None):
    """Find all matching subelements by tag name or path.

    Delegates to the root element's iterfind() method.

    *path* is a string having either an element tag or an XPath,
    *namespaces* is an optional mapping from namespace prefix to full name.

    A *path* starting with "/" is rewritten to start with "./" and a
    FutureWarning is issued.

    Return whatever the root element's iterfind() returns.

    """
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    root = self._root
    return root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000728
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Serialize the element tree to a file.

    Arguments:
      *file_or_filename* -- file name or a file object opened for writing

      *encoding* -- the output encoding; when omitted, "utf-8" is used
                    for the "c14n" method and "us-ascii" otherwise

      *xml_declaration* -- controls whether an XML declaration is written
                           (only for the "xml" method).  If None, a
                           declaration is written when the encoding is
                           not one of US-ASCII, UTF-8 or "unicode"

      *default_namespace* -- sets the default XML namespace (for "xmlns")

      *method* -- either "xml" (default), "html", "text", or "c14n"

      *short_empty_elements* -- controls the formatting of elements
                                that contain no content.  If True
                                (default) they are emitted as a single
                                self-closed tag, otherwise they are
                                emitted as a pair of start/end tags

    Raises ValueError if *method* is not a known serialization method.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)

    if encoding:
        encoding = encoding.lower()
    elif method == "c14n":
        encoding = "utf-8"
    else:
        encoding = "us-ascii"

    # An explicit xml_declaration wins; None means "decide from encoding".
    if xml_declaration is None:
        emit_declaration = encoding not in ("utf-8", "us-ascii", "unicode")
    else:
        emit_declaration = xml_declaration

    with _get_writer(file_or_filename, encoding) as emit:
        if method == "xml" and emit_declaration:
            if encoding == "unicode":
                # "unicode" is not a real encoding name; declare the
                # locale's preferred encoding instead.
                import locale
                declared = locale.getpreferredencoding()
            else:
                declared = encoding
            emit("<?xml version='1.0' encoding='%s'?>\n" % (declared,))
        if method == "text":
            _serialize_text(emit, self._root)
            return
        qnames, namespaces = _namespaces(self._root, default_namespace)
        serializer = _serialize[method]
        serializer(emit, self._root, qnames, namespaces,
                   short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000787
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; prefer write(file, method="c14n").

    """
    result = self.write(file, method="c14n")
    return result
Armin Rigo9ed73062005-12-14 18:10:45 +0000791
792# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000793# serialization support
794
Eli Bendersky00f402b2012-07-15 06:02:22 +0300795@contextlib.contextmanager
796def _get_writer(file_or_filename, encoding):
797 # returns text write method and release all resourses after using
798 try:
799 write = file_or_filename.write
800 except AttributeError:
801 # file_or_filename is a file name
802 if encoding == "unicode":
803 file = open(file_or_filename, "w")
804 else:
805 file = open(file_or_filename, "w", encoding=encoding,
806 errors="xmlcharrefreplace")
807 with file:
808 yield file.write
809 else:
810 # file_or_filename is a file-like object
811 # encoding determines if it is a text or binary writer
812 if encoding == "unicode":
813 # use a text writer as is
814 yield write
815 else:
816 # wrap a binary writer with TextIOWrapper
817 with contextlib.ExitStack() as stack:
818 if isinstance(file_or_filename, io.BufferedIOBase):
819 file = file_or_filename
820 elif isinstance(file_or_filename, io.RawIOBase):
821 file = io.BufferedWriter(file_or_filename)
822 # Keep the original file open when the BufferedWriter is
823 # destroyed
824 stack.callback(file.detach)
825 else:
826 # This is to handle passed objects that aren't in the
827 # IOBase hierarchy, but just have a write method
828 file = io.BufferedIOBase()
829 file.writable = lambda: True
830 file.write = write
831 try:
832 # TextIOWrapper uses this methods to determine
833 # if BOM (for UTF-16, etc) should be added
834 file.seekable = file_or_filename.seekable
835 file.tell = file_or_filename.tell
836 except AttributeError:
837 pass
838 file = io.TextIOWrapper(file,
839 encoding=encoding,
840 errors="xmlcharrefreplace",
841 newline="\n")
842 # Keep the original file open when the TextIOWrapper is
843 # destroyed
844 stack.callback(file.detach)
845 yield file.write
846
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks *elem* and all of its descendants, looking at tags, attribute
    names, and QName-valued attribute values and text.

    Returns a (qnames, namespaces) pair: *qnames* maps each name as it
    appears in the tree to its serialized "prefix:local" form, and
    *namespaces* maps namespace URIs to the prefixes chosen for them.

    Raises ValueError if *default_namespace* is given and an unqualified
    name is encountered.

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # Work out and record the serialized form of one name.
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def note(name):
        # Register a name the first time it is seen.
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            note(tag.text)
        elif isinstance(tag, str):
            note(tag)
        elif not (tag is None or tag is Comment or tag is PI):
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            note(key)
            if isinstance(value, QName):
                note(value.text)
        text = node.text
        if isinstance(text, QName):
            note(text.text)
    return qnames, namespaces
907
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through *write*.

    *qnames* maps tag and attribute names to their serialized form,
    *namespaces* maps namespace URIs to prefixes and is emitted as xmlns
    attributes on this element only (children are serialized with None).
    *short_empty_elements* selects "<tag />" over "<tag></tag>" for
    elements without text or children.

    """
    node_tag = elem.tag
    content = elem.text

    def write_content():
        # Escaped text followed by every child, recursively.
        if content:
            write(_escape_cdata(content))
        for child in elem:
            _serialize_xml(write, child, qnames, None,
                           short_empty_elements=short_empty_elements)

    if node_tag is Comment:
        write("<!--%s-->" % content)
    elif node_tag is ProcessingInstruction:
        write("<?%s?>" % content)
    else:
        name = qnames[node_tag]
        if name is None:
            # No tag of its own: only the content is written.
            write_content()
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                by_prefix = sorted(namespaces.items(),
                                   key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in by_prefix:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if content or len(elem) or not short_empty_elements:
                write(">")
                write_content()
                write("</" + name + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
957
# Lower-case names of HTML elements that are serialized without an end tag.
# Built directly as a set: the former tuple-then-set() conversion guarded
# by "except NameError" was a leftover from interpreters without a set
# builtin and could never take the fallback path.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
965
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through *write*.

    *qnames* maps tag and attribute names to their serialized form,
    *namespaces* maps namespace URIs to prefixes and is emitted as xmlns
    attributes on this element only (children are serialized with None).

    Text inside "script" and "style" elements is written unescaped, and
    elements listed in HTML_EMPTY get no end tag.  Both checks compare the
    lower-cased tag, but the end tag itself is written with the same
    spelling as the start tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            # Use a separate lower-cased copy for the case-insensitive
            # checks so the end tag matches the start tag exactly.
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1015
1016def _serialize_text(write, elem):
1017 for part in elem.itertext():
1018 write(part)
1019 if elem.tail:
1020 write(elem.tail)
1021
# Dispatch table: output method name -> serializer function.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
    # this optional method is imported at the end of the module
    # c14n=_serialize_c14n,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001029
Armin Rigo9ed73062005-12-14 18:10:45 +00001030
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix matches the "ns<digits>" form, which is
    reserved for automatically generated prefixes.

    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted inside the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001064
1065def _raise_serialization_error(text):
1066 raise TypeError(
1067 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1068 )
1069
1070def _escape_cdata(text):
1071 # escape character data
1072 try:
1073 # it's worth avoiding do-nothing calls for strings that are
1074 # shorter than 500 character, or so. assume that's, by far,
1075 # the most common case in most applications.
1076 if "&" in text:
1077 text = text.replace("&", "&amp;")
1078 if "<" in text:
1079 text = text.replace("<", "&lt;")
1080 if ">" in text:
1081 text = text.replace(">", "&gt;")
1082 return text
1083 except (TypeError, AttributeError):
1084 _raise_serialization_error(text)
1085
1086def _escape_attrib(text):
1087 # escape attribute value
1088 try:
1089 if "&" in text:
1090 text = text.replace("&", "&amp;")
1091 if "<" in text:
1092 text = text.replace("<", "&lt;")
1093 if ">" in text:
1094 text = text.replace(">", "&gt;")
1095 if "\"" in text:
1096 text = text.replace("\"", "&quot;")
1097 if "\n" in text:
1098 text = text.replace("\n", "&#10;")
1099 return text
1100 except (TypeError, AttributeError):
1101 _raise_serialization_error(text)
1102
1103def _escape_attrib_html(text):
1104 # escape attribute value
1105 try:
1106 if "&" in text:
1107 text = text.replace("&", "&amp;")
1108 if ">" in text:
1109 text = text.replace(">", "&gt;")
1110 if "\"" in text:
1111 text = text.replace("\"", "&quot;")
1112 return text
1113 except (TypeError, AttributeError):
1114 _raise_serialization_error(text)
1115
1116# --------------------------------------------------------------------
1117
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Generate a string representation of an XML element.

    The element is wrapped in an ElementTree and written to an in-memory
    stream.  If *encoding* is "unicode" the stream is an io.StringIO and
    a string is returned; otherwise it is an io.BytesIO and a bytestring
    is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding, *method* is an optional output method, and
    *short_empty_elements* is passed through to ElementTree.write().

    """
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    tree = ElementTree(element)
    tree.write(buffer, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return buffer.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001136
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001137class _ListDataStream(io.BufferedIOBase):
Eli Bendersky84fae782013-03-09 07:12:48 -08001138 """An auxiliary stream accumulating into a list reference."""
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001139 def __init__(self, lst):
1140 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001141
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001142 def writable(self):
1143 return True
1144
1145 def seekable(self):
1146 return True
1147
1148 def write(self, b):
1149 self.lst.append(b)
1150
1151 def tell(self):
1152 return len(self.lst)
1153
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Generate a list of chunks representing an XML element.

    The element is wrapped in an ElementTree and written to a
    _ListDataStream; the list that stream accumulated is returned.

    *element*, *encoding*, *method* and *short_empty_elements* are
    passed through to ElementTree.write().

    """
    chunks = []
    sink = _ListDataStream(chunks)
    tree = ElementTree(element)
    tree.write(sink, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001161
Armin Rigo9ed73062005-12-14 18:10:45 +00001162
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element, which is
    wrapped in an ElementTree first.  The tree is written with the
    "unicode" encoding, and a newline is added unless the root's tail
    already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    ends_with_newline = bool(tail) and tail[-1] == "\n"
    if not ends_with_newline:
        sys.stdout.write("\n")
1180
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001181# --------------------------------------------------------------------
1182# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001183
Armin Rigo9ed73062005-12-14 18:10:45 +00001184
def parse(source, parser=None):
    """Parse an XML document into an element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance; both are handed to
    ElementTree.parse() unchanged.

    Return the ElementTree instance that did the parsing.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1197
Armin Rigo9ed73062005-12-14 18:10:45 +00001198
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.

    """
    close_source = False
    if not hasattr(source, "read"):
        # A file name was given: open it here and remember that the
        # iterator is responsible for closing it.
        source = open(source, "rb")
        close_source = True
    try:
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Building the iterator failed (for instance on an unknown event
        # name); do not leak the file that was opened above.
        if close_source:
            source.close()
        raise
Armin Rigo9ed73062005-12-14 18:10:45 +00001219
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001220
class IncrementalParser:
    """Parser fed with chunks of data that queues the resulting events.

    *events* is a sequence of event names to report (only "end" when
    omitted); *parser* is an optional parser instance, defaulting to an
    XMLParser driving a TreeBuilder.  After eof_received() and once all
    queued events have been consumed, the *root* attribute holds the
    value returned by the parser's close().

    """

    def __init__(self, events=None, parser=None):
        # _elementtree.c expects a list, not a deque
        self._events_queue = []
        # Position in the queue of the next event to hand out.
        self._index = 0
        self._root = None
        self.root = None
        self._parser = parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        requested = ("end",) if events is None else events
        self._parser._setevents(self._events_queue, requested)

    def data_received(self, data):
        """Feed *data* to the parser.

        A SyntaxError raised by the parser is queued and re-raised later
        from events().  Raises ValueError once eof_received() was called.

        """
        if self._parser is None:
            raise ValueError("data_received() called after end of stream")
        if not data:
            return
        try:
            self._parser.feed(data)
        except SyntaxError as exc:
            self._events_queue.append(exc)

    def eof_received(self):
        """Close the parser and record the value it returns."""
        self._root = self._parser.close()
        self._parser = None
        # Publish the root right away only if no events are pending.
        if self._index >= len(self._events_queue):
            self.root = self._root

    def events(self):
        """Yield the queued events, raising any queued exception."""
        queue = self._events_queue
        while True:
            position = self._index
            if position >= len(queue):
                break
            item = queue[position]
            # Avoid retaining references to past events
            queue[position] = None
            position += 1
            # Compact the list in a O(1) amortized fashion
            if 2 * position >= len(queue):
                del queue[:position]
                self._index = 0
            else:
                self._index = position
            if isinstance(item, Exception):
                raise item
            yield item
        if self._parser is None:
            self.root = self._root
1274
1275
class _IterParseIterator(IncrementalParser):
    """Iterator that pulls data from a file and yields parser events.

    *source* is a file object, read in chunks as events are requested;
    *events* and *parser* are passed to IncrementalParser.  When
    *close_source* is true the file is closed once iteration finishes.

    """

    def __init__(self, source, events, parser, close_source=False):
        super().__init__(events, parser)
        self._file = source
        self._close_file = close_source

    def __iter__(self):
        return self

    def __next__(self):
        nothing = object()
        while True:
            # Hand out one already-queued event, if there is any.
            event = next(self.events(), nothing)
            if event is not nothing:
                return event
            if self._parser is None:
                # End of stream was already processed: we are done.
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            chunk = self._file.read(16384)
            if chunk:
                self.data_received(chunk)
            else:
                self.eof_received()
Armin Rigo9ed73062005-12-14 18:10:45 +00001300
Armin Rigo9ed73062005-12-14 18:10:45 +00001301
def XML(text, parser=None):
    """Parse an XML document from a string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an optional
    parser instance; when omitted an XMLParser driving a TreeBuilder
    is created.

    Returns whatever the parser's close() returns after being fed *text*.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1317
Armin Rigo9ed73062005-12-14 18:10:45 +00001318
def XMLID(text, parser=None):
    """Parse an XML document from a string constant for its IDs.

    *text* is a string containing XML data, *parser* is an optional
    parser instance; when omitted an XMLParser driving a TreeBuilder
    is created.

    Returns an (Element, dict) tuple, in which the dict maps each
    non-empty "id" attribute value to the element carrying it.  When
    several elements share an id, the last one in iteration order wins.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    elements_by_id = {}
    for node in root.iter():
        identifier = node.get("id")
        if identifier:
            elements_by_id[identifier] = node
    return root, elements_by_id
1339
Victor Stinner765531d2013-03-26 01:11:54 +01001340# Parse XML document from string constant. Alias for XML().
Armin Rigo9ed73062005-12-14 18:10:45 +00001341fromstring = XML
Armin Rigo9ed73062005-12-14 18:10:45 +00001342
def fromstringlist(sequence, parser=None):
    """Parse an XML document from a sequence of string fragments.

    *sequence* is a list or other iterable of fragments, each of which
    is fed to the parser in order; *parser* is an optional parser
    instance; when omitted an XMLParser driving a TreeBuilder is created.

    Returns whatever the parser's close() returns.

    """
    active_parser = parser or XMLParser(target=TreeBuilder())
    feed = active_parser.feed
    for fragment in sequence:
        feed(fragment)
    return active_parser.close()
1357
1358# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001359
Armin Rigo9ed73062005-12-14 18:10:45 +00001360
class TreeBuilder:
    """Generic element structure builder.

    Turns a sequence of start(), data() and end() calls into a tree of
    elements.  It can be driven by a custom XML parser, or by a parser
    for some other XML-like format.

    *element_factory* is an optional callable invoked as
    element_factory(tag, attrs) to create each new element; it defaults
    to Element.

    """

    def __init__(self, element_factory=None):
        self._data = []    # text chunks collected since the last flush
        self._elem = []    # stack of currently open elements
        self._last = None  # most recently opened or closed element
        self._tail = None  # true if we're after an end tag
        self._factory = (Element if element_factory is None
                         else element_factory)

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected text to the last element, as its tail when
        # we are past its end tag and as its text otherwise.
        if not self._data:
            return
        target = self._last
        if target is not None:
            joined = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = joined
            else:
                assert target.text is None, "internal error (text)"
                target.text = joined
        self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        element = self._factory(tag, attrs)
        self._last = element
        open_elements = self._elem
        if open_elements:
            # Nest the new element under the innermost open one.
            open_elements[-1].append(element)
        open_elements.append(element)
        self._tail = 0
        return element

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1436
Armin Rigo9ed73062005-12-14 18:10:45 +00001437
Eli Bendersky84fae782013-03-09 07:12:48 -08001438# also see ElementTree and TreeBuilder
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001439class XMLParser:
Eli Bendersky84fae782013-03-09 07:12:48 -08001440 """Element structure builder for XML source data based on the expat parser.
1441
1442 *html* are predefined HTML entities (not supported currently),
1443 *target* is an optional target object which defaults to an instance of the
1444 standard TreeBuilder class, *encoding* is an optional encoding string
1445 which if given, overrides the encoding specified in the XML file:
1446 http://www.iana.org/assignments/character-sets
1447
1448 """
Armin Rigo9ed73062005-12-14 18:10:45 +00001449
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001450 def __init__(self, html=0, target=None, encoding=None):
Armin Rigo9ed73062005-12-14 18:10:45 +00001451 try:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001452 from xml.parsers import expat
Armin Rigo9ed73062005-12-14 18:10:45 +00001453 except ImportError:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001454 try:
1455 import pyexpat as expat
1456 except ImportError:
1457 raise ImportError(
1458 "No module named expat; use SimpleXMLTreeBuilder instead"
1459 )
1460 parser = expat.ParserCreate(encoding, "}")
Armin Rigo9ed73062005-12-14 18:10:45 +00001461 if target is None:
1462 target = TreeBuilder()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001463 # underscored names are provided for compatibility only
1464 self.parser = self._parser = parser
1465 self.target = self._target = target
1466 self._error = expat.error
Armin Rigo9ed73062005-12-14 18:10:45 +00001467 self._names = {} # name memo cache
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001468 # main callbacks
Armin Rigo9ed73062005-12-14 18:10:45 +00001469 parser.DefaultHandlerExpand = self._default
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001470 if hasattr(target, 'start'):
1471 parser.StartElementHandler = self._start
1472 if hasattr(target, 'end'):
1473 parser.EndElementHandler = self._end
1474 if hasattr(target, 'data'):
1475 parser.CharacterDataHandler = target.data
1476 # miscellaneous callbacks
1477 if hasattr(target, 'comment'):
1478 parser.CommentHandler = target.comment
1479 if hasattr(target, 'pi'):
1480 parser.ProcessingInstructionHandler = target.pi
Armin Rigo9ed73062005-12-14 18:10:45 +00001481 # let expat do the buffering, if supported
1482 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001483 parser.buffer_text = 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001484 except AttributeError:
1485 pass
1486 # use new-style attribute handling, if supported
1487 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001488 parser.ordered_attributes = 1
1489 parser.specified_attributes = 1
1490 if hasattr(target, 'start'):
1491 parser.StartElementHandler = self._start_list
Armin Rigo9ed73062005-12-14 18:10:45 +00001492 except AttributeError:
1493 pass
Armin Rigo9ed73062005-12-14 18:10:45 +00001494 self._doctype = None
1495 self.entity = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001496 try:
1497 self.version = "Expat %d.%d.%d" % expat.version_info
1498 except AttributeError:
1499 pass # unknown
1500
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001501 def _setevents(self, event_list, events):
1502 # Internal API for IncrementalParser
1503 parser = self._parser
1504 append = event_list.append
1505 for event in events:
1506 if event == "start":
Eli Benderskyc9f5ca22013-04-20 09:11:37 -07001507 parser.ordered_attributes = 1
1508 parser.specified_attributes = 1
1509 def handler(tag, attrib_in, event=event, append=append,
1510 start=self._start_list):
1511 append((event, start(tag, attrib_in)))
1512 parser.StartElementHandler = handler
Antoine Pitrou5b235d02013-04-18 19:37:06 +02001513 elif event == "end":
1514 def handler(tag, event=event, append=append,
1515 end=self._end):
1516 append((event, end(tag)))
1517 parser.EndElementHandler = handler
1518 elif event == "start-ns":
1519 def handler(prefix, uri, event=event, append=append):
1520 append((event, (prefix or "", uri or "")))
1521 parser.StartNamespaceDeclHandler = handler
1522 elif event == "end-ns":
1523 def handler(prefix, event=event, append=append):
1524 append((event, None))
1525 parser.EndNamespaceDeclHandler = handler
1526 else:
1527 raise ValueError("unknown event %r" % event)
1528
def _raiseerror(self, value):
    """Raise a ParseError built from the error object *value*.

    The error's code is copied, and its lineno and offset are stored
    as the (line, column) pair in the position attribute.

    """
    wrapped = ParseError(value)
    wrapped.code = value.code
    wrapped.position = (value.lineno, value.offset)
    raise wrapped
Armin Rigo9ed73062005-12-14 18:10:45 +00001534
def _fixname(self, key):
    """Return the expanded name for *key*, caching it in ``self._names``.

    A key containing "}" gets a leading "{" prepended; any other key is
    returned unchanged. The result is stored in ``self._names`` so later
    lookups for the same key return the cached value.
    """
    names = self._names
    try:
        return names[key]
    except KeyError:
        pass
    expanded = "{" + key if "}" in key else key
    names[key] = expanded
    return expanded
1545
def _start(self, tag, attrib_in):
    """Forward a start tag whose attributes arrive as a mapping.

    The tag and every attribute name are passed through ``self._fixname``;
    the result of ``self.target.start(tag, attrib)`` is returned.
    """
    expand = self._fixname
    qname = expand(tag)
    attributes = {expand(name): value for name, value in attrib_in.items()}
    return self.target.start(qname, attributes)
Armin Rigo9ed73062005-12-14 18:10:45 +00001553
def _start_list(self, tag, attrib_in):
    """Forward a start tag whose attributes arrive as a flat sequence.

    *attrib_in* is read as ``[name, value, name, value, ...]``; a falsy
    value yields an empty attribute dict. The tag and attribute names are
    passed through ``self._fixname`` and the result of
    ``self.target.start(tag, attrib)`` is returned.
    """
    expand = self._fixname
    qname = expand(tag)
    attributes = {}
    positions = range(0, len(attrib_in), 2) if attrib_in else ()
    for pos in positions:
        # Fetch the value first so a truncated sequence fails before the
        # name is expanded.
        value = attrib_in[pos + 1]
        attributes[expand(attrib_in[pos])] = value
    return self.target.start(qname, attributes)
Armin Rigo9ed73062005-12-14 18:10:45 +00001562
def _end(self, tag):
    """Forward an end tag: expand *tag* and return ``self.target.end(...)``."""
    qname = self._fixname(tag)
    return self.target.end(qname)
1565
def _default(self, text):
    """Handle entity references and DOCTYPE fragments found in *text*.

    NOTE(review): presumably installed as the expat default handler; the
    registration is outside this block.

    * Text starting with "&" is treated as an entity reference: the name
      between the first and last character is looked up in ``self.entity``
      and passed to ``self.target.data``. Nothing happens when the target
      has no ``data`` attribute. An unknown name raises ``expat.error``
      with ``code`` 11 and ``lineno``/``offset`` taken from ``self.parser``.
    * Text starting with "<!DOCTYPE" starts collecting doctype tokens in
      ``self._doctype``.
    * While collecting, stripped non-empty tokens are appended until a
      PUBLIC (4 tokens) or SYSTEM (3 tokens) declaration is complete, then
      the doctype is reported and collection stops. A ">" token also stops
      collection.
    """
    lead = text[:1]

    if lead == "&":
        # deal with undefined entities
        try:
            emit = self.target.data
        except AttributeError:
            return
        try:
            emit(self.entity[text[1:-1]])
        except KeyError:
            from xml.parsers import expat
            line = self.parser.ErrorLineNumber
            column = self.parser.ErrorColumnNumber
            failure = expat.error(
                "undefined entity %s: line %d, column %d" %
                (text, line, column)
            )
            failure.code = 11  # XML_ERROR_UNDEFINED_ENTITY
            failure.lineno = line
            failure.offset = column
            raise failure
        return

    if lead == "<" and text[:9] == "<!DOCTYPE":
        self._doctype = []  # inside a doctype declaration
        return

    if self._doctype is None:
        return

    # parse doctype contents
    if lead == ">":
        self._doctype = None
        return
    token = text.strip()
    if not token:
        return
    self._doctype.append(token)
    count = len(self._doctype)
    if count <= 2:
        return

    keyword = self._doctype[1]
    if keyword == "PUBLIC" and count == 4:
        name, _, pubid, system = self._doctype
        if pubid:
            pubid = pubid[1:-1]  # drop the first and last character
    elif keyword == "SYSTEM" and count == 3:
        name, _, system = self._doctype
        pubid = None
    else:
        # declaration not complete (or not recognised) yet
        return

    system_id = system[1:-1]  # drop the first and last character
    if hasattr(self.target, "doctype"):
        self.target.doctype(name, pubid, system_id)
    elif self.doctype != self._XMLParser__doctype:
        # warn about deprecated call
        self._XMLParser__doctype(name, pubid, system_id)
        self.doctype(name, pubid, system_id)
    self._doctype = None
1617
def doctype(self, name, pubid, system):
    """(Deprecated) Handle a doctype declaration.

    *name* is the doctype name, *pubid* is the public identifier and
    *system* is the system identifier. The arguments are not used: this
    implementation only emits a DeprecationWarning.

    """
    message = (
        "This method of XMLParser is deprecated. Define doctype() "
        "method on the TreeBuilder target."
    )
    warnings.warn(message, DeprecationWarning)

# sentinel, if doctype is redefined in a subclass
__doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001633
def feed(self, data):
    """Feed encoded data to parser.

    *data* is handed to ``self.parser.Parse`` with the final flag set to 0.
    An exception of type ``self._error`` is passed to ``self._raiseerror``.
    """
    try:
        self.parser.Parse(data, 0)
    except self._error as exc:
        self._raiseerror(exc)
Armin Rigo9ed73062005-12-14 18:10:45 +00001640
def close(self):
    """Finish feeding data to parser and return element structure.

    Signals end of data with ``self.parser.Parse("", 1)``; an exception of
    type ``self._error`` is passed to ``self._raiseerror``. Afterwards the
    result of ``self.target.close()`` is returned, or None when the target
    has no ``close`` attribute. The ``parser``, ``_parser``, ``target`` and
    ``_target`` attributes are deleted once that second step has run,
    whether or not it raised.
    """
    try:
        self.parser.Parse("", 1)  # end of data
    except self._error as exc:
        self._raiseerror(exc)

    try:
        try:
            finish = self.target.close
        except AttributeError:
            return None
        # Called outside the inner try so an AttributeError raised by the
        # handler itself is not swallowed.
        return finish()
    finally:
        # get rid of circular references
        del self.parser, self._parser, self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001657
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001658
# Import the C accelerators
try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    pass
else:
    # Overwrite 'ElementTree.parse' to use the C XMLParser

    class ElementTree(ElementTree):
        __doc__ = ElementTree.__doc__

        def parse(self, source, parser=None):
            # The docstring is copied from the base class right after this
            # definition (see the assignment below the method).
            close_source = False
            if not hasattr(source, 'read'):
                # *source* has no read(): treat it as a file name and make
                # sure the file opened here is closed again.
                source = open(source, 'rb')
                close_source = True
            try:
                if parser is not None:
                    # Caller-supplied parser: feed it in 64 KiB chunks.
                    while True:
                        data = source.read(65536)
                        if not data:
                            break
                        parser.feed(data)
                    self._root = parser.close()
                else:
                    # No parser given: let XMLParser read the source itself.
                    parser = XMLParser()
                    self._root = parser._parse(source)
                return self._root
            finally:
                if close_source:
                    source.close()

        # While this class body runs, the name ``ElementTree`` still refers
        # to the base class being subclassed, so this copies the base
        # method's docstring onto the override. (Assigning ``__doc__``
        # inside the method body would only create an unused local.)
        parse.__doc__ = ElementTree.parse.__doc__
1691
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001692
# compatibility: XMLTreeBuilder is kept as a second name bound to the same
# object as XMLParser
XMLTreeBuilder = XMLParser
1695
# workaround circular import.
# Register the "c14n" serializer only when ElementC14N can be imported;
# otherwise the table is left unchanged.
try:
    from ElementC14N import _serialize_c14n
except ImportError:
    pass
else:
    _serialize["c14n"] = _serialize_c14n