blob: a8e57295c60c622b7dc38545cfd332a73c0fb5d1 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +00003# $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $
Armin Rigo9ed73062005-12-14 18:10:45 +00004#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00005# light-weight XML support for Python 2.3 and later.
Armin Rigo9ed73062005-12-14 18:10:45 +00006#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00007# history (since 1.2.6):
8# 2005-11-12 fl added tostringlist/fromstringlist helpers
9# 2006-07-05 fl merged in selected changes from the 1.3 sandbox
10# 2006-07-05 fl removed support for 2.1 and earlier
11# 2007-06-21 fl added deprecation/future warnings
12# 2007-08-25 fl added doctype hook, added parser version attribute etc
13# 2007-08-26 fl added new serializer code (better namespace handling, etc)
14# 2007-08-27 fl warn for broken /tag searches on tree level
15# 2007-09-02 fl added html/text methods to serializer (experimental)
16# 2007-09-05 fl added method argument to tostring/tostringlist
17# 2007-09-06 fl improved error handling
18# 2007-09-13 fl added itertext, iterfind; assorted cleanups
19# 2007-12-15 fl added C14N hooks, copy method (experimental)
Armin Rigo9ed73062005-12-14 18:10:45 +000020#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000021# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000022#
23# fredrik@pythonware.com
24# http://www.pythonware.com
25#
26# --------------------------------------------------------------------
27# The ElementTree toolkit is
28#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000029# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000030#
31# By obtaining, using, and/or copying this software and/or its
32# associated documentation, you agree that you have read, understood,
33# and will comply with the following terms and conditions:
34#
35# Permission to use, copy, modify, and distribute this software and
36# its associated documentation for any purpose and without fee is
37# hereby granted, provided that the above copyright notice appears in
38# all copies, and that both that copyright notice and this permission
39# notice appear in supporting documentation, and that the name of
40# Secret Labs AB or the author not be used in advertising or publicity
41# pertaining to distribution of the software without specific, written
42# prior permission.
43#
44# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
45# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
46# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
47# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
48# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
49# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
50# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
51# OF THIS SOFTWARE.
52# --------------------------------------------------------------------
53
Fredrik Lundh63168a52005-12-14 22:29:34 +000054# Licensed to PSF under a Contributor Agreement.
Florent Xiclunaf15351d2010-03-13 23:24:31 +000055# See http://www.python.org/psf/license for licensing details.
Fredrik Lundh63168a52005-12-14 22:29:34 +000056
# Names exported by "from <this module> import *".  Some of these are
# defined further down in the module.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLTreeBuilder",
    "register_namespace",
]

# Version string of this ElementTree implementation.
VERSION = "1.3.0"
77
Armin Rigo9ed73062005-12-14 18:10:45 +000078##
79# The <b>Element</b> type is a flexible container object, designed to
80# store hierarchical data structures in memory. The type can be
81# described as a cross between a list and a dictionary.
82# <p>
83# Each element has a number of properties associated with it:
84# <ul>
85# <li>a <i>tag</i>. This is a string identifying what kind of data
86# this element represents (the element type, in other words).</li>
87# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
88# <li>a <i>text</i> string.</li>
89# <li>an optional <i>tail</i> string.</li>
90# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
91# </ul>
92#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000093# To create an element instance, use the {@link #Element} constructor
94# or the {@link #SubElement} factory function.
Armin Rigo9ed73062005-12-14 18:10:45 +000095# <p>
96# The {@link #ElementTree} class can be used to wrap an element
97# structure, and convert it from and to XML.
98##
99
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000100import sys
101import re
102import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +0300103import io
104import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000105
Eli Bendersky27cbb192012-06-15 09:03:19 +0300106from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000107
Armin Rigo9ed73062005-12-14 18:10:45 +0000108
109##
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000110# Parser error. This is a subclass of <b>SyntaxError</b>.
Armin Rigo9ed73062005-12-14 18:10:45 +0000111# <p>
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000112# In addition to the exception value, an exception instance contains a
113# specific exception code in the <b>code</b> attribute, and the line and
114# column of the error in the <b>position</b> attribute.
115
class ParseError(SyntaxError):
    """Parser error; a plain subclass of SyntaxError.

    The class itself adds nothing.  According to the module comments the
    raising code attaches ``code`` and ``position`` attributes to each
    instance -- that happens outside this class, so verify at the raise
    site.
    """
118
119# --------------------------------------------------------------------
120
121##
122# Checks if an object appears to be a valid element object.
Armin Rigo9ed73062005-12-14 18:10:45 +0000123#
# @param element An element instance.
125# @return A true value if this is an element object.
126# @defreturn flag
127
def iselement(element):
    """Return True if *element* appears to be an element object.

    The check is duck-typed: any object exposing a ``tag`` attribute
    qualifies.
    """
    # FIXME: not sure about this;
    # isinstance(element, Element) or look for tag/attrib/text attributes
    missing = object()
    return getattr(element, 'tag', missing) is not missing
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000132
133##
134# Element class. This class defines the Element interface, and
135# provides a reference implementation of this interface.
136# <p>
137# The element name, attribute names, and attribute values can be
138# either ASCII strings (ordinary Python strings containing only 7-bit
139# ASCII characters) or Unicode strings.
140#
141# @param tag The element name.
142# @param attrib An optional dictionary, containing element attributes.
143# @param **extra Additional attributes, given as keyword arguments.
Armin Rigo9ed73062005-12-14 18:10:45 +0000144# @see Element
145# @see SubElement
146# @see Comment
147# @see ProcessingInstruction
148
class Element:
    """Reference implementation of the Element interface.

    An element is a flexible container, best described as a cross between
    a list (of child elements) and a dictionary (of attributes).  Besides
    its children and attributes, every element has a tag, an optional
    text string and an optional tail string.

    *tag* is the element name, *attrib* is an optional dictionary of
    element attributes, and *extra* supplies additional attributes as
    keyword arguments.  Raises TypeError if *attrib* is not a dict.
    """

    # <tag attrib>text<child/>...</tag>tail

    # Element tag.
    tag = None

    # Element attribute dictionary.  Where possible, use get(), set(),
    # keys() and items() to access element attributes.
    attrib = None

    # Text before the first subelement.  Either a string or None; when
    # there is no text this may be None or an empty string, depending on
    # the parser.
    text = None

    # Text after this element's end tag, but before the next sibling
    # element's start tag.  Either a string or None; when there is no
    # text this may be None or an empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Work on a copy so neither the caller's dict nor the shared
        # default argument is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element of the same type as this element.

        *tag* is the element tag and *attrib* a dictionary of element
        attributes.  Returns the new element instance.
        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return a shallow copy of this element (experimental).

        Tag, attributes, text and tail are copied; the subelements are
        shared with the original tree.
        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of subelements.

        Only full elements are counted; to find out whether an element
        has any content, check both the length and the text attribute.
        """
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement at *index* (an integer or a slice).

        Raises IndexError if the given subelement does not exist.
        """
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement at *index* (an integer or a slice).

        No type check is performed on *element*.  Raises IndexError if
        the given subelement does not exist.
        """
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement at *index* (an integer or a slice).

        Raises IndexError if the given subelement does not exist.
        """
        del self._children[index]

    def append(self, element):
        """Add *element* to the end of this element's children.

        In document order the new element appears after the last
        existing subelement (or directly after the text, if it is the
        first subelement), but before this element's end tag.  Raises
        TypeError if *element* is not an Element.
        """
        self._assert_is_element(element)
        self._children.append(element)

    def extend(self, elements):
        """Append subelements from *elements*, any iterable of elements.

        Every item is validated before anything is appended, so a
        TypeError for a bad item leaves this element unchanged.
        """
        # Materialise once: the input is walked twice (validate, then
        # append), and a one-shot iterator such as a generator would be
        # exhausted by the validation pass, appending nothing.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, element):
        """Insert *element* at position *index* among the subelements.

        Raises TypeError if *element* is not an Element.
        """
        self._assert_is_element(element)
        self._children.insert(index, element)

    def _assert_is_element(self, e):
        """Raise TypeError unless *e* is an instance of the Python Element."""
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, element):
        """Remove the subelement *element*.

        This is a plain list removal on the children, so the first child
        that is *element* (or compares equal to it) is dropped.  To
        remove subelements by other means, the easiest way is often a
        list comprehension selecting what to keep, followed by slice
        assignment on the parent.  Raises ValueError if no matching
        subelement is found.
        """
        self._children.remove(element)

    def getchildren(self):
        """(Deprecated) Return all subelements, in document order.

        Emits a DeprecationWarning.  The returned list is the element's
        internal child list, not a copy.
        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find the first matching subelement, by tag name or path.

        *path* says what element to look for; *namespaces* is an
        optional namespace prefix map.  Delegates to ElementPath.find,
        which returns the first matching element or None.
        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find the text of the first matching subelement.

        *path* says what element to look for, *default* is returned if
        the element is not found, and *namespaces* is an optional
        namespace prefix map.  Delegates to ElementPath.findtext; an
        element that is found but has no text yields an empty string.
        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements, by tag name or path.

        *path* says what element to look for; *namespaces* is an
        optional namespace prefix map.  Delegates to ElementPath.findall
        and returns a list or other sequence of the matching elements,
        in document order.
        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements, by tag name or path.

        *path* says what element to look for; *namespaces* is an
        optional namespace prefix map.  Delegates to ElementPath.iterfind
        and returns an iterator or sequence of the matching elements, in
        document order.
        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset this element.

        Removes all subelements, clears all attributes, and sets the
        text and tail attributes to None.
        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Return the attribute *key*, or *default* if it is not set.

        Equivalent to attrib.get(key, default).
        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set the attribute *key* to *value*.

        Equivalent to attrib[key] = value.
        """
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names.  Equivalent to attrib.keys()."""
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs.

        Equivalent to attrib.items().
        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Iterate over this element and all subelements, in document order.

        Only elements whose tag equals *tag* are produced; None or "*"
        (the default) produces every element.

        If the tree structure is modified during iteration, new or
        removed elements may or may not be included.  To get a stable
        set, call list() on the iterator and loop over the result.
        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for descendant in child.iter(tag):
                yield descendant

    # compatibility
    def getiterator(self, tag=None):
        """Return list(self.iter(tag)), emitting a PendingDeprecationWarning."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Iterate over all inner text, in document order.

        Produces this element's text, then for every subelement its
        inner text followed by its tail.  Empty or None strings are
        skipped, and nothing is produced for an element whose tag is
        neither a string nor None.
        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            for s in e.itertext():
                yield s
            if e.tail:
                yield e.tail

# compatibility
_Element = _ElementInterface = Element
Armin Rigo9ed73062005-12-14 18:10:45 +0000499
500##
501# Subelement factory. This function creates an element instance, and
502# appends it to an existing element.
503# <p>
504# The element name, attribute names, and attribute values can be
505# either 8-bit ASCII strings or Unicode strings.
506#
507# @param parent The parent element.
508# @param tag The subelement name.
509# @param attrib An optional dictionary, containing element attributes.
510# @param **extra Additional attributes, given as keyword arguments.
511# @return An element instance.
512# @defreturn Element
513
def SubElement(parent, tag, attrib={}, **extra):
    """Create a subelement of *parent* and append it to *parent*.

    *tag* is the subelement name, *attrib* an optional dictionary of
    element attributes, and *extra* supplies additional attributes as
    keyword arguments.  The new element is built with
    parent.makeelement() and returned.
    """
    # Merge into a copy so the caller's dictionary (and the shared
    # default) stays untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
520
521##
522# Comment element factory. This factory function creates a special
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000523# element that will be serialized as an XML comment by the standard
524# serializer.
Armin Rigo9ed73062005-12-14 18:10:45 +0000525# <p>
526# The comment string can be either an 8-bit ASCII string or a Unicode
527# string.
528#
529# @param text A string containing the comment string.
530# @return An element instance, representing a comment.
531# @defreturn Element
532
def Comment(text=None):
    """Create a comment element.

    The returned element uses this factory function itself as its tag
    and carries *text* as the comment string.  Per the module comments
    the standard serializer writes such an element as an XML comment.
    """
    node = Element(Comment)
    node.text = text
    return node
537
538##
539# PI element factory. This factory function creates a special element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000540# that will be serialized as an XML processing instruction by the standard
541# serializer.
Armin Rigo9ed73062005-12-14 18:10:45 +0000542#
543# @param target A string containing the PI target.
544# @param text A string containing the PI contents, if any.
545# @return An element instance, representing a PI.
546# @defreturn Element
547
def ProcessingInstruction(target, text=None):
    """Create a processing-instruction element.

    The returned element uses this factory function itself as its tag.
    Its text is *target*, followed by a space and *text* when *text* is
    non-empty.  Per the module comments the standard serializer writes
    such an element as an XML processing instruction.
    """
    node = Element(ProcessingInstruction)
    content = target
    if text:
        content = content + " " + text
    node.text = content
    return node

# Short alias for the factory above.
PI = ProcessingInstruction
556
557##
558# QName wrapper. This can be used to wrap a QName attribute value, in
559# order to get proper namespace handling on output.
560#
561# @param text A string containing the QName value, in the form {uri}local,
562# or, if the tag argument is given, the URI part of a QName.
563# @param tag Optional tag. If given, the first argument is interpreted as
564# an URI, and this argument is interpreted as a local name.
565# @return An opaque object, representing the QName.
566
class QName:
    """Wrapper for a qualified name.

    Can be used to wrap a QName attribute value in order to get proper
    namespace handling on output.

    *text_or_uri* is the QName value in the form {uri}local or, when
    *tag* is given, only the URI part; *tag* is then the local name.
    Comparisons and hashing use the resulting text, and a QName may be
    compared against another QName or against a plain value.
    """

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the namespace URI.
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<QName %r>' % (self.text,)

    def __hash__(self):
        return hash(self.text)

    # Each rich comparison unwraps another QName to its text first and
    # then compares the two values directly.

    def __le__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text <= other

    def __lt__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text < other

    def __ge__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text >= other

    def __gt__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text > other

    def __eq__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text == other

    def __ne__(self, other):
        if isinstance(other, QName):
            other = other.text
        return self.text != other
Armin Rigo9ed73062005-12-14 18:10:45 +0000602
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000603# --------------------------------------------------------------------
604
Armin Rigo9ed73062005-12-14 18:10:45 +0000605##
606# ElementTree wrapper class. This class represents an entire element
607# hierarchy, and adds some extra support for serialization to and from
608# standard XML.
609#
610# @param element Optional root element.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000611# @keyparam file Optional file handle or file name. If given, the
Armin Rigo9ed73062005-12-14 18:10:45 +0000612# tree is initialized with the contents of this XML file.
613
614class ElementTree:
615
616 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000617 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000618 self._root = element # first node
619 if file:
620 self.parse(file)
621
622 ##
623 # Gets the root element for this tree.
624 #
625 # @return An element instance.
626 # @defreturn Element
627
628 def getroot(self):
629 return self._root
630
631 ##
632 # Replaces the root element for this tree. This discards the
633 # current contents of the tree, and replaces it with the given
634 # element. Use with care.
635 #
636 # @param element An element instance.
637
638 def _setroot(self, element):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000639 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000640 self._root = element
641
642 ##
643 # Loads an external XML document into this element tree.
644 #
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000645 # @param source A file name or file object. If a file object is
646 # given, it only has to implement a <b>read(n)</b> method.
647 # @keyparam parser An optional parser instance. If not given, the
648 # standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +0000649 # @return The document root element.
650 # @defreturn Element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000651 # @exception ParseError If the parser fails to parse the document.
Armin Rigo9ed73062005-12-14 18:10:45 +0000652
653 def parse(self, source, parser=None):
Antoine Pitroue033e062010-10-29 10:38:18 +0000654 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000655 if not hasattr(source, "read"):
656 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000657 close_source = True
658 try:
659 if not parser:
660 parser = XMLParser(target=TreeBuilder())
661 while 1:
662 data = source.read(65536)
663 if not data:
664 break
665 parser.feed(data)
666 self._root = parser.close()
667 return self._root
668 finally:
669 if close_source:
670 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000671
672 ##
673 # Creates a tree iterator for the root element. The iterator loops
674 # over all elements in this tree, in document order.
675 #
676 # @param tag What tags to look for (default is to return all elements)
677 # @return An iterator.
678 # @defreturn iterator
679
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000680 def iter(self, tag=None):
681 # assert self._root is not None
682 return self._root.iter(tag)
683
684 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000685 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000686 # Change for a DeprecationWarning in 1.4
687 warnings.warn(
688 "This method will be removed in future versions. "
689 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
690 PendingDeprecationWarning, stacklevel=2
691 )
692 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000693
694 ##
Eli Bendersky7343cb02013-03-12 06:01:22 -0700695 # Same as getroot().find(path), starting at the root of the tree.
Armin Rigo9ed73062005-12-14 18:10:45 +0000696 #
697 # @param path What element to look for.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000698 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000699 # @return The first matching element, or None if no element was found.
700 # @defreturn Element or None
701
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000702 def find(self, path, namespaces=None):
703 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000704 if path[:1] == "/":
705 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000706 warnings.warn(
707 "This search is broken in 1.3 and earlier, and will be "
708 "fixed in a future version. If you rely on the current "
709 "behaviour, change it to %r" % path,
710 FutureWarning, stacklevel=2
711 )
712 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000713
714 ##
Eli Bendersky7343cb02013-03-12 06:01:22 -0700715 # Same as getroot().findtext(path), starting at the root of the tree.
Armin Rigo9ed73062005-12-14 18:10:45 +0000716 #
Eli Bendersky7343cb02013-03-12 06:01:22 -0700717 # @param path What element to look for.
Armin Rigo9ed73062005-12-14 18:10:45 +0000718 # @param default What to return if the element was not found.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000719 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000720 # @return The text content of the first matching element, or the
    # default value if no element was found. Note that if the element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000722 # is found, but has no text content, this method returns an
Armin Rigo9ed73062005-12-14 18:10:45 +0000723 # empty string.
724 # @defreturn string
725
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000726 def findtext(self, path, default=None, namespaces=None):
727 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000728 if path[:1] == "/":
729 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000730 warnings.warn(
731 "This search is broken in 1.3 and earlier, and will be "
732 "fixed in a future version. If you rely on the current "
733 "behaviour, change it to %r" % path,
734 FutureWarning, stacklevel=2
735 )
736 return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000737
738 ##
Eli Bendersky7343cb02013-03-12 06:01:22 -0700739 # Same as getroot().findall(path), starting at the root of the tree.
Armin Rigo9ed73062005-12-14 18:10:45 +0000740 #
741 # @param path What element to look for.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000742 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000743 # @return A list or iterator containing all matching elements,
744 # in document order.
745 # @defreturn list of Element instances
746
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000747 def findall(self, path, namespaces=None):
748 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000749 if path[:1] == "/":
750 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000751 warnings.warn(
752 "This search is broken in 1.3 and earlier, and will be "
753 "fixed in a future version. If you rely on the current "
754 "behaviour, change it to %r" % path,
755 FutureWarning, stacklevel=2
756 )
757 return self._root.findall(path, namespaces)
758
759 ##
760 # Finds all matching subelements, by tag name or path.
761 # Same as getroot().iterfind(path).
762 #
763 # @param path What element to look for.
764 # @keyparam namespaces Optional namespace prefix map.
765 # @return An iterator or sequence containing all matching elements,
766 # in document order.
767 # @defreturn a generated sequence of Element instances
768
769 def iterfind(self, path, namespaces=None):
770 # assert self._root is not None
771 if path[:1] == "/":
772 path = "." + path
773 warnings.warn(
774 "This search is broken in 1.3 and earlier, and will be "
775 "fixed in a future version. If you rely on the current "
776 "behaviour, change it to %r" % path,
777 FutureWarning, stacklevel=2
778 )
779 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000780
781 ##
782 # Writes the element tree to a file, as XML.
783 #
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000784 # @def write(file, **options)
Armin Rigo9ed73062005-12-14 18:10:45 +0000785 # @param file A file name, or a file object opened for writing.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000786 # @param **options Options, given as keyword arguments.
Florent Xiclunac17f1722010-08-08 19:48:29 +0000787 # @keyparam encoding Optional output encoding (default is US-ASCII).
788 # Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000789 # @keyparam xml_declaration Controls if an XML declaration should
790 # be added to the file. Use False for never, True for always,
Florent Xiclunac17f1722010-08-08 19:48:29 +0000791 # None for only if not US-ASCII or UTF-8 or Unicode. None is default.
Serhiy Storchaka03530b92013-01-13 21:58:04 +0200792 # @keyparam default_namespace Sets the default XML namespace (for "xmlns").
793 # @keyparam method Optional output method ("xml", "html", "text" or
794 # "c14n"; default is "xml").
Armin Rigo9ed73062005-12-14 18:10:45 +0000795
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000796 def write(self, file_or_filename,
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000797 encoding=None,
798 xml_declaration=None,
799 default_namespace=None,
800 method=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000801 if not method:
802 method = "xml"
803 elif method not in _serialize:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000804 raise ValueError("unknown method %r" % method)
Florent Xiclunac17f1722010-08-08 19:48:29 +0000805 if not encoding:
806 if method == "c14n":
807 encoding = "utf-8"
808 else:
809 encoding = "us-ascii"
Florent Xiclunac17f1722010-08-08 19:48:29 +0000810 else:
811 encoding = encoding.lower()
Eli Bendersky00f402b2012-07-15 06:02:22 +0300812 with _get_writer(file_or_filename, encoding) as write:
813 if method == "xml" and (xml_declaration or
814 (xml_declaration is None and
815 encoding not in ("utf-8", "us-ascii", "unicode"))):
816 declared_encoding = encoding
817 if encoding == "unicode":
818 # Retrieve the default encoding for the xml declaration
819 import locale
820 declared_encoding = locale.getpreferredencoding()
821 write("<?xml version='1.0' encoding='%s'?>\n" % (
822 declared_encoding,))
823 if method == "text":
824 _serialize_text(write, self._root)
Armin Rigo9ed73062005-12-14 18:10:45 +0000825 else:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300826 qnames, namespaces = _namespaces(self._root, default_namespace)
827 serialize = _serialize[method]
828 serialize(write, self._root, qnames, namespaces)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000829
830 def write_c14n(self, file):
831 # lxml.etree compatibility. use output method instead
832 return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000833
834# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000835# serialization support
836
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for a target.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by ``open()``.  *encoding* is the output encoding;
    the special value "unicode" means text is written through unchanged
    instead of being encoded.

    Files opened here are closed on exit.  Wrappers created around a
    caller-supplied object are detached on exit, so the caller's object
    stays open.
    """
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            # characters the encoding cannot represent are written as
            # numeric character references instead of raising
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if a BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write
888
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks ``elem.iter()`` and inspects every tag, attribute key, QName
    attribute value and QName text.  Returns ``(qnames, namespaces)``:
    *qnames* maps each name to its serialized "prefix:local" form (or the
    bare name), and *namespaces* maps namespace URIs to prefixes.

    Raises ValueError if *default_namespace* is set and a non-qualified
    name is encountered; names of unsupported types are reported through
    ``_raise_serialization_error``.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    # invent a prefix for an unregistered namespace
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            # an empty prefix means the default namespace
            qnames[qname] = "%s:%s" % (prefix, local) if prefix else local
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
949
def _serialize_xml(write, elem, qnames, namespaces):
    """Write *elem* and its subtree as XML through the *write* callable.

    *qnames* maps tags and attribute names to their serialized form.
    *namespaces* maps URIs to prefixes and is emitted as xmlns
    declarations on this element; children are serialized with None so
    the declarations appear only once.  The element's tail, if any, is
    written after the element itself.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % content)
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % content)
    else:
        name = qnames[raw_tag]
        if name is None:
            # no tag of its own: only the text and the children are written
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_xml(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if attributes or namespaces:
                if namespaces:
                    declarations = sorted(namespaces.items(),
                                          key=lambda x: x[1])  # sort on prefix
                    for uri, prefix in declarations:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix,
                            _escape_attrib(uri)
                            ))
                for key, value in sorted(attributes):  # lexical order
                    if isinstance(key, QName):
                        key = key.text
                    if isinstance(value, QName):
                        value = qnames[value.text]
                    else:
                        value = _escape_attrib(value)
                    write(" %s=\"%s\"" % (qnames[key], value))
            if content or len(elem):
                write(">")
                if content:
                    write(_escape_cdata(content))
                for child in elem:
                    _serialize_xml(write, child, qnames, None)
                write("</" + name + ">")
            else:
                # no text and no children: use the short empty-element form
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))
996
# Names of HTML elements that are written without a closing tag.
# Built directly as a set; the old tuple-plus-try/except fallback only
# mattered on interpreters that had no built-in set type.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
1004
def _serialize_html(write, elem, qnames, namespaces):
    """Write *elem* and its subtree as HTML through the *write* callable.

    *qnames* maps tags and attribute names to their serialized form.
    *namespaces* maps URIs to prefixes and is emitted as xmlns
    declarations on this element; children are serialized with None.
    The text of "script" and "style" elements is written unescaped, and
    elements whose lower-cased name is in HTML_EMPTY get no closing tag.
    The element's tail, if any, is written after the element itself.
    """
    raw_tag = elem.tag
    content = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(content))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(content))
    else:
        name = qnames[raw_tag]
        if name is None:
            # no tag of its own: only the text and the children are written
            if content:
                write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if attributes or namespaces:
                if namespaces:
                    declarations = sorted(namespaces.items(),
                                          key=lambda x: x[1])  # sort on prefix
                    for uri, prefix in declarations:
                        if prefix:
                            prefix = ":" + prefix
                        write(" xmlns%s=\"%s\"" % (
                            prefix,
                            _escape_attrib(uri)
                            ))
                for key, value in sorted(attributes):  # lexical order
                    if isinstance(key, QName):
                        key = key.text
                    if isinstance(value, QName):
                        value = qnames[value.text]
                    else:
                        value = _escape_attrib_html(value)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if content:
                if lowered in ("script", "style"):
                    write(content)
                else:
                    write(_escape_cdata(content))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1054
def _serialize_text(write, elem):
    """Write the text parts of *elem*, then its tail, without any markup.

    Each item produced by ``elem.itertext()`` is passed to *write* as is;
    a non-empty ``elem.tail`` is written last.
    """
    for chunk in elem.itertext():
        write(chunk)
    trailing = elem.tail
    if trailing:
        write(trailing)
1060
# Dispatch table from output method name to serializer function.
# ElementTree.write() rejects any method that is not a key here.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
Armin Rigo9ed73062005-12-14 18:10:45 +00001068
1069##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001070# Registers a namespace prefix. The registry is global, and any
1071# existing mapping for either the given prefix or the namespace URI
1072# will be removed.
Armin Rigo9ed73062005-12-14 18:10:45 +00001073#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001074# @param prefix Namespace prefix.
1075# @param uri Namespace uri. Tags and attributes in this namespace
1076# will be serialized with the given prefix, if at all possible.
1077# @exception ValueError If the prefix is reserved, or is otherwise
1078# invalid.
Armin Rigo9ed73062005-12-14 18:10:45 +00001079
def register_namespace(prefix, uri):
    """Register *prefix* for the namespace *uri* in the global registry.

    Any existing mapping for either the given prefix or the given URI is
    removed first.  Raises ValueError if *prefix* has the form "ns<digits>",
    which is reserved for automatically generated prefixes.
    """
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence and triggers a warning on newer Python versions.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # iterate over a copy because entries are deleted during the loop
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001102
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* cannot be serialized.

    The message contains the repr of the value and the name of its type.
    """
    type_name = type(text).__name__
    raise TypeError("cannot serialize %r (type %s)" % (text, type_name))
1107
def _escape_cdata(text):
    """Escape "&", "<" and ">" in character data and return the result.

    Values that do not support the needed string operations are reported
    through ``_raise_serialization_error``.
    """
    try:
        # Test for each character before replacing it: most strings are
        # short and need no escaping, so do-nothing replace() calls are
        # worth avoiding.  "&" must be handled first so the entities
        # produced below are not escaped a second time.
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            if raw in text:
                text = text.replace(raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1123
def _escape_attrib(text):
    """Escape an attribute value for XML output and return the result.

    Replaces "&", "<", ">", the double quote and the newline character.
    Values that do not support the needed string operations are reported
    through ``_raise_serialization_error``.
    """
    replacements = (
        ("&", "&amp;"),  # first, so later entities are not re-escaped
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\"", "&quot;"),
        ("\n", "&#10;"),
    )
    try:
        for raw, entity in replacements:
            if raw in text:
                text = text.replace(raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1140
def _escape_attrib_html(text):
    """Escape an attribute value for HTML output and return the result.

    Replaces "&", ">" and the double quote; "<" and newlines are left
    untouched.  Values that do not support the needed string operations
    are reported through ``_raise_serialization_error``.
    """
    replacements = (
        ("&", "&amp;"),  # first, so later entities are not re-escaped
        (">", "&gt;"),
        ("\"", "&quot;"),
    )
    try:
        for raw, entity in replacements:
            if raw in text:
                text = text.replace(raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1153
1154# --------------------------------------------------------------------
1155
1156##
1157# Generates a string representation of an XML element, including all
Florent Xiclunac17f1722010-08-08 19:48:29 +00001158# subelements. If encoding is "unicode", the return type is a string;
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001159# otherwise it is a bytes array.
1160#
1161# @param element An Element instance.
Florent Xiclunac17f1722010-08-08 19:48:29 +00001162# @keyparam encoding Optional output encoding (default is US-ASCII).
1163# Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001164# @keyparam method Optional output method ("xml", "html", "text" or
1165# "c14n"; default is "xml").
1166# @return An (optionally) encoded string containing the XML data.
1167# @defreturn string
1168
def tostring(element, encoding=None, method=None):
    """Return a string representation of *element* and its subelements.

    *encoding* is the output encoding; when it is "unicode" (in any
    letter case) the result is a str, otherwise it is bytes.  *method* is
    the output method and is forwarded to ``ElementTree.write``.
    """
    # ElementTree.write() lower-cases the encoding before testing for
    # "unicode", so the buffer type must be chosen the same way;
    # otherwise e.g. "Unicode" would write str data into a BytesIO.
    text_output = bool(encoding) and encoding.lower() == "unicode"
    stream = io.StringIO() if text_output else io.BytesIO()
    ElementTree(element).write(stream, encoding, method=method)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001173
1174##
1175# Generates a string representation of an XML element, including all
Eli Bendersky00f402b2012-07-15 06:02:22 +03001176# subelements.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001177#
1178# @param element An Element instance.
1179# @keyparam encoding Optional output encoding (default is US-ASCII).
Florent Xiclunac17f1722010-08-08 19:48:29 +00001180# Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001181# @keyparam method Optional output method ("xml", "html", "text" or
1182# "c14n"; default is "xml").
1183# @return A sequence object containing the XML data.
1184# @defreturn sequence
1185# @since 1.3
1186
class _ListDataStream(io.BufferedIOBase):
    """Auxiliary stream that accumulates written chunks into a list.

    The list is supplied by the caller and is appended to in place.
    """

    def __init__(self, lst):
        # keep a reference to the caller's list, not a copy
        self.lst = lst

    def write(self, b):
        """Append the chunk *b* to the list."""
        self.lst.append(b)

    def tell(self):
        """Return the number of chunks written so far."""
        return len(self.lst)

    def seekable(self):
        """Report the stream as seekable."""
        return True

    def writable(self):
        """Report the stream as writable."""
        return True
1204
def tostringlist(element, encoding=None, method=None):
    """Return the serialized form of *element* as a list of chunks.

    The element is written through a ``_ListDataStream``, so the result
    holds the pieces in the order they were written.  *encoding* and
    *method* are forwarded to ``ElementTree.write``.
    """
    chunks = []
    collector = _ListDataStream(chunks)
    ElementTree(element).write(collector, encoding, method=method)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001210
1211##
1212# Writes an element tree or element structure to sys.stdout. This
1213# function should be used for debugging only.
1214# <p>
1215# The exact output format is implementation dependent. In this
1216# version, it's written as an ordinary XML file.
1217#
1218# @param elem An element tree or an individual element.
1219
def dump(elem):
    """Write an element tree or a single element to sys.stdout.

    Intended for debugging.  A bare element is wrapped in an ElementTree
    first.  A newline is written afterwards unless the root's tail
    already ends with one.
    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1228
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001229# --------------------------------------------------------------------
1230# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001231
1232##
1233# Parses an XML document into an element tree.
1234#
1235# @param source A filename or file object containing XML data.
1236# @param parser An optional parser instance. If not given, the
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001237# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001238# @return An ElementTree instance
1239
def parse(source, parser=None):
    """Parse an XML document into a new ElementTree and return it.

    *source* and *parser* are passed unchanged to ``ElementTree.parse``.
    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1244
1245##
1246# Parses an XML document into an element tree incrementally, and reports
1247# what's going on to the user.
1248#
1249# @param source A filename or file object containing XML data.
1250# @param events A list of events to report back. If omitted, only "end"
1251# events are reported.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001252# @param parser An optional parser instance. If not given, the
1253# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001254# @return A (event, elem) iterator.
1255
def iterparse(source, events=None, parser=None):
    """Parse an XML document incrementally and report events.

    *source* is an object with a ``read`` attribute, or a file name that
    is opened here in binary mode.  *events* is a list of events to
    report back and is passed on to the iterator unchanged.  *parser* is
    an optional parser instance; a standard XMLParser feeding a
    TreeBuilder is created when it is omitted.

    Returns an iterator of ``(event, elem)`` pairs.  A file opened by
    this function is closed by the iterator when iteration finishes, or
    here if setting up the parser or the iterator fails.
    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Setup failed (e.g. an unknown event name): nobody else will
        # ever see the file we opened, so close it before propagating.
        if close_source:
            source.close()
        raise
Armin Rigo9ed73062005-12-14 18:10:45 +00001264
class _IterParseIterator:
    """Iterator returned by iterparse(), yielding (event, value) pairs.

    It installs handlers on the parser's underlying ``_parser`` object
    that append events to an internal buffer.  ``__next__`` drains that
    buffer and, when it is empty, reads another block from the source
    and feeds it to the parser.
    """

    def __init__(self, source, events, parser, close_source=False):
        # source: object with a read() method
        # events: iterable of event names, or None for ["end"]
        # parser: parser object exposing _parser, _start, _start_list,
        #         _end, feed() and close()
        # close_source: whether to close the source once parsing is done
        self._file = source
        self._close_file = close_source
        self._events = []       # buffered events not yet handed out
        self._index = 0         # position of the next event in _events
        self._error = None      # error from feed(), raised after the buffer
        self.root = self._root = None  # root is only set at the very end
        self._parser = parser
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            # the handlers bind event/append (and the parser callback) as
            # default arguments so each keeps the values of its iteration
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back to the non-list start callback
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event)

    def __next__(self):
        # Return the next buffered event, reading and parsing more input
        # whenever the buffer runs dry.
        while 1:
            try:
                item = self._events[self._index]
                self._index += 1
                return item
            except IndexError:
                pass
            # buffer exhausted: report a pending parse error, if any
            if self._error:
                e = self._error
                self._error = None
                raise e
            # the parser was closed on a previous pass: iteration is over
            if self._parser is None:
                self.root = self._root
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            del self._events[:]
            self._index = 0
            data = self._file.read(16384)
            if data:
                try:
                    self._parser.feed(data)
                except SyntaxError as exc:
                    # keep the error until events gathered so far are out
                    self._error = exc
            else:
                # end of input: close the parser and remember the root
                self._root = self._parser.close()
                self._parser = None

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001342
1343##
1344# Parses an XML document from a string constant. This function can
1345# be used to embed "XML literals" in Python code.
1346#
1347# @param source A string containing XML data.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001348# @param parser An optional parser instance. If not given, the
1349# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001350# @return An Element instance.
1351# @defreturn Element
1352
def XML(text, parser=None):
    """Parse an XML document from a string and return the result.

    *text* is fed to *parser* in one call, and the value of the parser's
    ``close()`` is returned.  When *parser* is omitted (or false), a
    standard XMLParser feeding a TreeBuilder is used.
    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1358
1359##
1360# Parses an XML document from a string constant, and also returns
1361# a dictionary which maps from element id:s to elements.
1362#
1363# @param source A string containing XML data.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001364# @param parser An optional parser instance. If not given, the
1365# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001366# @return A tuple containing an Element instance and a dictionary.
1367# @defreturn (Element, dictionary)
1368
def XMLID(text, parser=None):
    """Parse XML from a string and also collect elements by their id.

    Returns a tuple ``(tree, ids)``: *tree* is the value of the parser's
    ``close()``, and *ids* maps each non-empty "id" attribute value found
    by ``tree.iter()`` to its element.  If an id occurs more than once,
    the last element seen wins.  When *parser* is omitted (or false), a
    standard XMLParser feeding a TreeBuilder is used.
    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    labelled = ((node.get("id"), node) for node in tree.iter())
    ids = {ident: node for ident, node in labelled if ident}
    return tree, ids
1380
1381##
1382# Parses an XML document from a string constant. Same as {@link #XML}.
1383#
1384# @def fromstring(text)
1385# @param source A string containing XML data.
1386# @return An Element instance.
1387# @defreturn Element
1388
# fromstring is an alias: it is the very same function object as XML.
fromstring = XML
1390
1391##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001392# Parses an XML document from a sequence of string fragments.
Armin Rigo9ed73062005-12-14 18:10:45 +00001393#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001394# @param sequence A list or other sequence containing XML data fragments.
1395# @param parser An optional parser instance. If not given, the
1396# standard {@link XMLParser} parser is used.
1397# @return An Element instance.
1398# @defreturn Element
1399# @since 1.3
Armin Rigo9ed73062005-12-14 18:10:45 +00001400
def fromstringlist(sequence, parser=None):
    """Parse an XML document from a sequence of string fragments.

    Each item of *sequence* is fed to *parser* in order, and the value of
    the parser's ``close()`` is returned.  When *parser* is omitted (or
    false), a standard XMLParser feeding a TreeBuilder is used.
    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active.feed(fragment)
    return active.close()
1407
1408# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001409
1410##
1411# Generic element structure builder. This builder converts a sequence
1412# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1413# #TreeBuilder.end} method calls to a well-formed element structure.
1414# <p>
1415# You can use this class to build an element structure using a custom XML
1416# parser, or a parser for some other XML-like format.
1417#
1418# @param element_factory Optional element factory. This factory
1419# is called to create new Element instances, as necessary.
1420
class TreeBuilder:
    """Generic element structure builder.

    Converts a sequence of start(), data() and end() calls into an
    element structure.  *element_factory* is called as
    ``factory(tag, attrs)`` to create each element and defaults to
    Element.
    """

    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Return the toplevel element.

        Asserts that no element is still open and that at least one
        element was built.
        """
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Move collected character data onto the last element: into its
        # tail after an end tag, into its text otherwise.
        if not self._data:
            return
        target = self._last
        if target is not None:
            text = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = text
            else:
                assert target.text is None, "internal error (text)"
                target.text = text
        # data collected before any element exists is dropped here
        self._data = []

    def data(self, data):
        """Add the string *data* to the pending text of the current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open a new element and return it.

        *tag* is the element name and *attrs* holds its attributes; both
        are passed to the element factory.  The new element is appended
        to the currently open element, if there is one.
        """
        self._flush()
        opened = self._factory(tag, attrs)
        self._last = opened
        if self._elem:
            self._elem[-1].append(opened)
        self._elem.append(opened)
        self._tail = 0
        return opened

    def end(self, tag):
        """Close the current element and return it.

        Asserts that *tag* matches the tag of the element being closed.
        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1497
1498##
1499# Element structure builder for XML source data, based on the
1500# <b>expat</b> parser.
1501#
1502# @keyparam target Target object. If omitted, the builder uses an
1503# instance of the standard {@link #TreeBuilder} class.
1504# @keyparam html Predefine HTML entities. This flag is not supported
1505# by the current implementation.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001506# @keyparam encoding Optional encoding. If given, the value overrides
1507# the encoding specified in the XML file.
Armin Rigo9ed73062005-12-14 18:10:45 +00001508# @see #ElementTree
1509# @see #TreeBuilder
1510
class XMLParser:
    """Element structure builder for XML source data, based on expat.

    Keyword parameters:

    *html* -- predefine HTML entities; this flag is not supported by the
    current implementation and is ignored.
    *target* -- target object receiving the parse events.  If omitted, an
    instance of the standard TreeBuilder class is used.
    *encoding* -- optional encoding.  If given, the value overrides the
    encoding specified in the XML file.

    Only the callbacks the target actually defines ('start', 'end', 'data',
    'comment', 'pi', 'doctype', 'close') are used.
    """

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator; _fixname() later prepends the
        # matching "{" to build "{uri}local" names.
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        # keep the exception class around so that other methods never have
        # to import expat again (the import may have used the fallback)
        self._error = expat.error
        self._names = {}  # name memo cache
        # main callbacks
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            if hasattr(target, 'start'):
                parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None  # list of doctype tokens while inside one
        self.entity = {}  # user-supplied entity name -> replacement text
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass  # unknown

    def _raiseerror(self, value):
        """Re-raise the expat error *value* as a ParseError.

        The error code and the (line, column) position are copied over.
        """
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        """Return the expanded form of the expat name *key* (memoized)."""
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start(self, tag, attrib_in):
        """Start-element callback for dict-style attributes."""
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        """Start-element callback for ordered (flat list) attributes.

        *attrib_in* alternates names and values: [name, value, name, ...].
        """
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = attrib_in[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        """End-element callback; forwards the expanded tag to the target."""
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        """Default expat callback: handles entities and doctype tokens.

        Text starting with "&" is looked up in self.entity and passed to the
        target's data() method; an unknown name raises the expat error class
        with code 11.  Tokens that belong to a <!DOCTYPE ...> declaration are
        collected until a PUBLIC or SYSTEM declaration is complete, then
        reported through the target's doctype() method (or the deprecated
        doctype() method of a subclass).
        """
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            # Only the dictionary lookup is guarded, so that a KeyError
            # raised by the target itself is not mistaken for an
            # undefined entity.
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                # self._error is the expat error class saved by __init__;
                # using it keeps the pyexpat fallback working here too.
                err = self._error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11  # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
            data_handler(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = []  # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        pubid = pubid[1:-1]  # strip the quotes
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    def doctype(self, name, pubid, system):
        """This method of XMLParser is deprecated."""
        warnings.warn(
            "This method of XMLParser is deprecated.  Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    def feed(self, data):
        """Feed encoded *data* to the parser.

        Expat errors are re-raised as ParseError.
        """
        try:
            self.parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to the parser.

        Returns the result of the target's close() method, or None if the
        target does not define one.  Expat errors are re-raised as
        ParseError.  The parser and target references are dropped
        afterwards.
        """
        try:
            self.parser.Parse("", 1)  # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001703
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001704
# Import the C accelerators
try:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser
    from _elementtree import *
except ImportError:
    pass
else:
    # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser

    class ElementTree(ElementTree):
        def parse(self, source, parser=None):
            """Load an XML document into this tree and return its root.

            'source' is a file name or an object with a read() method.
            'parser' is an optional parser instance; when given, the data
            is fed to it in chunks, otherwise a fresh XMLParser parses the
            source directly.  A file opened here is always closed again.
            """
            close_source = False
            if not hasattr(source, 'read'):
                source = open(source, 'rb')
                close_source = True
            try:
                if parser is not None:
                    while True:
                        data = source.read(65536)
                        if not data:
                            break
                        parser.feed(data)
                    self._root = parser.close()
                else:
                    parser = XMLParser()
                    self._root = parser._parse(source)
                return self._root
            finally:
                if close_source:
                    source.close()

    class iterparse:
        """Parses an XML section into an element tree incrementally.

        Reports what's going on to the user. 'source' is a filename or file
        object containing XML data. 'events' is a list of events to report
        back.  The supported events are the strings "start", "end",
        "start-ns" and "end-ns" (the "ns" events are used to get detailed
        namespace information). If 'events' is omitted, only "end" events
        are reported.  'parser' is an optional parser instance. If not
        given, the standard XMLParser parser is used. Returns an iterator
        providing (event, elem) pairs.

        A file opened by iterparse itself (i.e. when a file name was given)
        is closed when the iteration finishes, and also when an exception
        propagates out of the iterator.
        """

        root = None

        def __init__(self, file, events=None, parser=None):
            self._close_file = False
            if not hasattr(file, 'read'):
                file = open(file, 'rb')
                self._close_file = True
            self._file = file
            self._events = []  # buffer filled by the parser
            self._index = 0  # next unread position in self._events
            self._error = None  # deferred parse error, if any
            self.root = self._root = None
            if parser is None:
                parser = XMLParser(target=TreeBuilder())
            self._parser = parser
            self._parser._setevents(self._events, events)

        def _close_owned_file(self):
            # Close the underlying file, but only if iterparse opened it.
            if self._close_file:
                self._file.close()

        def _next_event(self):
            # Return the next buffered event, refilling the buffer from the
            # file as needed; raises StopIteration once the input has been
            # parsed completely.
            while True:
                try:
                    item = self._events[self._index]
                    self._index += 1
                    return item
                except IndexError:
                    pass
                # A parse error is only reported after the events that were
                # buffered before it have been delivered.
                if self._error:
                    e = self._error
                    self._error = None
                    raise e
                if self._parser is None:
                    self.root = self._root
                    self._close_owned_file()
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    try:
                        self._parser.feed(data)
                    except SyntaxError as exc:
                        self._error = exc
                else:
                    self._root = self._parser.close()
                    self._parser = None

        def __next__(self):
            try:
                return self._next_event()
            except StopIteration:
                # normal exhaustion; the file has already been closed
                raise
            except BaseException:
                # do not leak a file we opened when parsing fails
                self._close_owned_file()
                raise

        def __iter__(self):
            return self
1797
# compatibility: XMLTreeBuilder is the former name of XMLParser
XMLTreeBuilder = XMLParser

# workaround circular import: the C14N serializer is registered only
# when the optional ElementC14N module can be imported.
try:
    from ElementC14N import _serialize_c14n
except ImportError:
    pass
else:
    _serialize["c14n"] = _serialize_c14n