blob: 641d787dab3e3f0d531000811b8e583ecf41d1e6 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +00003# $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $
Armin Rigo9ed73062005-12-14 18:10:45 +00004#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00005# light-weight XML support for Python 2.3 and later.
Armin Rigo9ed73062005-12-14 18:10:45 +00006#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00007# history (since 1.2.6):
8# 2005-11-12 fl added tostringlist/fromstringlist helpers
9# 2006-07-05 fl merged in selected changes from the 1.3 sandbox
10# 2006-07-05 fl removed support for 2.1 and earlier
11# 2007-06-21 fl added deprecation/future warnings
12# 2007-08-25 fl added doctype hook, added parser version attribute etc
13# 2007-08-26 fl added new serializer code (better namespace handling, etc)
14# 2007-08-27 fl warn for broken /tag searches on tree level
15# 2007-09-02 fl added html/text methods to serializer (experimental)
16# 2007-09-05 fl added method argument to tostring/tostringlist
17# 2007-09-06 fl improved error handling
18# 2007-09-13 fl added itertext, iterfind; assorted cleanups
19# 2007-12-15 fl added C14N hooks, copy method (experimental)
Armin Rigo9ed73062005-12-14 18:10:45 +000020#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000021# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000022#
23# fredrik@pythonware.com
24# http://www.pythonware.com
25#
26# --------------------------------------------------------------------
27# The ElementTree toolkit is
28#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000029# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000030#
31# By obtaining, using, and/or copying this software and/or its
32# associated documentation, you agree that you have read, understood,
33# and will comply with the following terms and conditions:
34#
35# Permission to use, copy, modify, and distribute this software and
36# its associated documentation for any purpose and without fee is
37# hereby granted, provided that the above copyright notice appears in
38# all copies, and that both that copyright notice and this permission
39# notice appear in supporting documentation, and that the name of
40# Secret Labs AB or the author not be used in advertising or publicity
41# pertaining to distribution of the software without specific, written
42# prior permission.
43#
44# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
45# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
46# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
47# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
48# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
49# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
50# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
51# OF THIS SOFTWARE.
52# --------------------------------------------------------------------
53
Fredrik Lundh63168a52005-12-14 22:29:34 +000054# Licensed to PSF under a Contributor Agreement.
Florent Xiclunaf15351d2010-03-13 23:24:31 +000055# See http://www.python.org/psf/license for licensing details.
Fredrik Lundh63168a52005-12-14 22:29:34 +000056
Armin Rigo9ed73062005-12-14 18:10:45 +000057__all__ = [
58 # public symbols
59 "Comment",
60 "dump",
61 "Element", "ElementTree",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000062 "fromstring", "fromstringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000063 "iselement", "iterparse",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000064 "parse", "ParseError",
Armin Rigo9ed73062005-12-14 18:10:45 +000065 "PI", "ProcessingInstruction",
66 "QName",
67 "SubElement",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000068 "tostring", "tostringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000069 "TreeBuilder",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000070 "VERSION",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010071 "XML", "XMLID",
Thomas Wouters0e3f5912006-08-11 14:57:12 +000072 "XMLParser", "XMLTreeBuilder",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010073 "register_namespace",
Armin Rigo9ed73062005-12-14 18:10:45 +000074 ]
75
Florent Xiclunaf15351d2010-03-13 23:24:31 +000076VERSION = "1.3.0"
77
Armin Rigo9ed73062005-12-14 18:10:45 +000078##
79# The <b>Element</b> type is a flexible container object, designed to
80# store hierarchical data structures in memory. The type can be
81# described as a cross between a list and a dictionary.
82# <p>
83# Each element has a number of properties associated with it:
84# <ul>
85# <li>a <i>tag</i>. This is a string identifying what kind of data
86# this element represents (the element type, in other words).</li>
87# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
88# <li>a <i>text</i> string.</li>
89# <li>an optional <i>tail</i> string.</li>
90# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
91# </ul>
92#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000093# To create an element instance, use the {@link #Element} constructor
94# or the {@link #SubElement} factory function.
Armin Rigo9ed73062005-12-14 18:10:45 +000095# <p>
96# The {@link #ElementTree} class can be used to wrap an element
97# structure, and convert it from and to XML.
98##
99
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000100import sys
101import re
102import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +0300103import io
104import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000105
Eli Bendersky27cbb192012-06-15 09:03:19 +0300106from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000107
Armin Rigo9ed73062005-12-14 18:10:45 +0000108
109##
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000110# Parser error. This is a subclass of <b>SyntaxError</b>.
Armin Rigo9ed73062005-12-14 18:10:45 +0000111# <p>
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000112# In addition to the exception value, an exception instance contains a
113# specific exception code in the <b>code</b> attribute, and the line and
114# column of the error in the <b>position</b> attribute.
115
class ParseError(SyntaxError):
    """Parser error; a subclass of SyntaxError.

    In addition to the exception value, an instance is documented to carry
    a specific exception code in its ``code`` attribute and the line and
    column of the error in its ``position`` attribute.  Those attributes
    are set by whoever raises the exception, not by this class.
    """
118
119# --------------------------------------------------------------------
120
121##
122# Checks if an object appears to be a valid element object.
Armin Rigo9ed73062005-12-14 18:10:45 +0000123#
# @param element An element instance.
125# @return A true value if this is an element object.
126# @defreturn flag
127
def iselement(element):
    """Return True if *element* appears to be a valid element object.

    The check is purely structural: any object exposing a ``tag``
    attribute is accepted.
    """
    # FIXME: not sure about this;
    # isinstance(element, Element) or look for tag/attrib/text attributes
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000132
133##
134# Element class. This class defines the Element interface, and
135# provides a reference implementation of this interface.
136# <p>
137# The element name, attribute names, and attribute values can be
138# either ASCII strings (ordinary Python strings containing only 7-bit
139# ASCII characters) or Unicode strings.
140#
141# @param tag The element name.
142# @param attrib An optional dictionary, containing element attributes.
143# @param **extra Additional attributes, given as keyword arguments.
Armin Rigo9ed73062005-12-14 18:10:45 +0000144# @see Element
145# @see SubElement
146# @see Comment
147# @see ProcessingInstruction
148
class Element:
    """Element class: the Element interface and its reference implementation.

    An element is a cross between a list (of child elements) and a
    dictionary (of attributes).  In XML terms it is laid out as::

        <tag attrib>text<child/>...</tag>tail

    The element name, attribute names, and attribute values can be either
    ASCII strings or Unicode strings.

    *tag* is the element name.  *attrib* is an optional dictionary of
    element attributes, and *extra* supplies additional attributes as
    keyword arguments.
    """

    # (Attribute) Element tag.
    tag = None

    # (Attribute) Element attribute dictionary.  Where possible, use
    # get(), set(), keys() and items() to access element attributes.
    attrib = None

    # (Attribute) Text before the first subelement.  This is either a
    # string or None.  If there was no text, this may be either None or an
    # empty string, depending on the parser.
    text = None

    # (Attribute) Text after this element's end tag, but before the next
    # sibling element's start tag.  This is either a string or None.  If
    # there was no text, this may be either None or an empty string,
    # depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        # The shared default dict is safe here: it is copied, never mutated.
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element object of the same type as this element.

        *tag* is the element tag and *attrib* the element attributes,
        given as a dictionary.  Returns the new element instance.
        """
        return self.__class__(tag, attrib)

    def copy(self):
        """(Experimental) Return a shallow copy of this element.

        Tag, attributes, text and tail are copied; subelements are shared
        with the original tree.
        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of subelements.

        This only counts full elements; to check whether an element has
        any content, check both the length and the ``text`` attribute.
        """
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement(s) at *index* (an integer or slice).

        Raises IndexError if the given element does not exist.
        """
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index* with *element*.

        Raises IndexError if the given element does not exist.
        """
        # NOTE: unlike append()/insert()/extend(), no element type check is
        # performed here.
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*.

        Raises IndexError if the given element does not exist.
        """
        del self._children[index]

    def append(self, element):
        """Add *element* to the end of this element's children.

        In document order, the new element appears after the last existing
        subelement (or directly after the text, if it is the first
        subelement), but before the end tag for this element.

        Raises TypeError if *element* is not an Element.
        """
        self._assert_is_element(element)
        self._children.append(element)

    def extend(self, elements):
        """Append all subelements from the iterable *elements*.

        Every item is validated before anything is added, so a TypeError
        leaves this element's children unchanged.
        """
        # Materialize first: *elements* may be a one-shot iterator, which
        # the validation pass would otherwise exhaust before the extend.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, element):
        """Insert *element* as a subelement at position *index*.

        Raises TypeError if *element* is not an Element.
        """
        self._assert_is_element(element)
        self._children.insert(index, element)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, element):
        """Remove a matching subelement.

        Unlike the find methods, this compares elements based on identity,
        not on tag value or contents.  To remove subelements by other
        means, the easiest way is often to use a list comprehension to
        select what elements to keep, and use slice assignment to update
        the parent element.

        Raises ValueError if a matching element could not be found.
        """
        self._children.remove(element)

    def getchildren(self):
        """(Deprecated) Return all subelements, in document order.

        The returned list is the element's internal child list, not a copy.
        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find the first matching subelement, by tag name or path.

        *path* is what element to look for and *namespaces* an optional
        namespace prefix map.  Returns the first matching element, or None
        if no element was found.
        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for the first matching subelement, by tag name or path.

        *path* is what element to look for, *default* what to return if
        the element was not found, and *namespaces* an optional namespace
        prefix map.  Returns the text content of the first matching
        element, or *default* if no element was found.  If the element is
        found but has no text content, an empty string is returned.
        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements, by tag name or path.

        *path* is what element to look for and *namespaces* an optional
        namespace prefix map.  Returns a list or other sequence containing
        all matching elements, in document order.
        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements, by tag name or path.

        *path* is what element to look for and *namespaces* an optional
        namespace prefix map.  Returns an iterator or sequence containing
        all matching elements, in document order.
        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset the element.

        Removes all subelements, clears all attributes, and sets the
        ``text`` and ``tail`` attributes to None.
        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Return the attribute *key*, or *default* if it is not set.

        Equivalent to ``attrib.get``.
        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set the attribute *key* to *value*.

        Equivalent to ``attrib[key] = value``.
        """
        self.attrib[key] = value

    def keys(self):
        """Return the element's attribute names.

        Equivalent to ``attrib.keys()``.
        """
        return self.attrib.keys()

    def items(self):
        """Return the element's attributes as (name, value) pairs.

        Equivalent to ``attrib.items()``.
        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create a tree iterator.

        The iterator loops over this element and all subelements, in
        document order, and yields all elements with a matching tag.  A
        *tag* of None or "*" matches every element.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.
        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for match in child.iter(tag):
                yield match

    # compatibility
    def getiterator(self, tag=None):
        """Return ``list(self.iter(tag))``; kept for compatibility."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create a text iterator.

        The iterator loops over this element and all subelements, in
        document order, and yields all inner text.
        """
        tag = self.tag
        # Elements whose tag is neither a string nor None (for example the
        # ones built by the Comment and ProcessingInstruction factories,
        # which use the factory function as tag) contribute no text.
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            for s in e.itertext():
                yield s
            if e.tail:
                yield e.tail
Armin Rigo9ed73062005-12-14 18:10:45 +0000496
497# compatibility
# Private aliases for the pure-Python Element class defined above.
_ElementInterface = Element
_Element = Element
Armin Rigo9ed73062005-12-14 18:10:45 +0000499
500##
501# Subelement factory. This function creates an element instance, and
502# appends it to an existing element.
503# <p>
504# The element name, attribute names, and attribute values can be
505# either 8-bit ASCII strings or Unicode strings.
506#
507# @param parent The parent element.
508# @param tag The subelement name.
509# @param attrib An optional dictionary, containing element attributes.
510# @param **extra Additional attributes, given as keyword arguments.
511# @return An element instance.
512# @defreturn Element
513
def SubElement(parent, tag, attrib={}, **extra):
    """Subelement factory: create an element and append it to *parent*.

    *tag* is the subelement name.  *attrib* is an optional dictionary of
    element attributes and *extra* supplies additional attributes as
    keyword arguments.  The new element is built with
    ``parent.makeelement`` and returned after being appended.
    """
    # Work on a copy so that neither the caller's dictionary nor the
    # shared default is ever modified.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
520
521##
522# Comment element factory. This factory function creates a special
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000523# element that will be serialized as an XML comment by the standard
524# serializer.
Armin Rigo9ed73062005-12-14 18:10:45 +0000525# <p>
526# The comment string can be either an 8-bit ASCII string or a Unicode
527# string.
528#
529# @param text A string containing the comment string.
530# @return An element instance, representing a comment.
531# @defreturn Element
532
def Comment(text=None):
    """Comment element factory.

    Creates a special element that the standard serializer writes out as
    an XML comment.  *text* is the comment string.  The returned element
    uses this factory function itself as its tag.
    """
    node = Element(Comment)
    node.text = text
    return node
537
538##
539# PI element factory. This factory function creates a special element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000540# that will be serialized as an XML processing instruction by the standard
541# serializer.
Armin Rigo9ed73062005-12-14 18:10:45 +0000542#
543# @param target A string containing the PI target.
544# @param text A string containing the PI contents, if any.
545# @return An element instance, representing a PI.
546# @defreturn Element
547
def ProcessingInstruction(target, text=None):
    """PI element factory.

    Creates a special element that the standard serializer writes out as
    an XML processing instruction.  *target* is the PI target and *text*
    the PI contents, if any.  The returned element uses this factory
    function itself as its tag; its text is the target, followed by a
    space and *text* when *text* is given.
    """
    node = Element(ProcessingInstruction)
    node.text = (target + " " + text) if text else target
    return node

# Short alias for the factory above.
PI = ProcessingInstruction
556
557##
558# QName wrapper. This can be used to wrap a QName attribute value, in
559# order to get proper namespace handling on output.
560#
561# @param text A string containing the QName value, in the form {uri}local,
562# or, if the tag argument is given, the URI part of a QName.
563# @param tag Optional tag. If given, the first argument is interpreted as
564# an URI, and this argument is interpreted as a local name.
565# @return An opaque object, representing the QName.
566
class QName:
    """QName wrapper.

    Wraps a QName attribute value in order to get proper namespace
    handling on output.

    *text_or_uri* is the QName value in the form ``{uri}local`` or, if
    *tag* is given, the URI part of a QName.  When *tag* is given it is
    used as the local name.

    Instances hash like their text and compare by text, both against other
    QName instances and against plain values.
    """

    def __init__(self, text_or_uri, tag=None):
        self.text = ("{%s}%s" % (text_or_uri, tag)) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<QName %r>' % (self.text,)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)

    def __ne__(self, other):
        return self.text != (other.text if isinstance(other, QName) else other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000602
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000603# --------------------------------------------------------------------
604
Armin Rigo9ed73062005-12-14 18:10:45 +0000605##
606# ElementTree wrapper class. This class represents an entire element
607# hierarchy, and adds some extra support for serialization to and from
608# standard XML.
609#
610# @param element Optional root element.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000611# @keyparam file Optional file handle or file name. If given, the
Armin Rigo9ed73062005-12-14 18:10:45 +0000612# tree is initialized with the contents of this XML file.
613
614class ElementTree:
615
def __init__(self, element=None, file=None):
    """Initialize the tree wrapper.

    *element* is the optional root element.  *file* is an optional file
    handle or file name; when it is given (and truthy) the tree is
    initialized by parsing it with self.parse().
    """
    self._root = element  # first node
    if not file:
        return
    self.parse(file)
621
622 ##
623 # Gets the root element for this tree.
624 #
625 # @return An element instance.
626 # @defreturn Element
627
def getroot(self):
    """Return the root element of this tree."""
    root = self._root
    return root
630
631 ##
632 # Replaces the root element for this tree. This discards the
633 # current contents of the tree, and replaces it with the given
634 # element. Use with care.
635 #
636 # @param element An element instance.
637
def _setroot(self, element):
    """Replace the root element of this tree with *element*.

    This discards the current contents of the tree.  Use with care: the
    new root is stored as given, without any validation.
    """
    setattr(self, "_root", element)
641
642 ##
643 # Loads an external XML document into this element tree.
644 #
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000645 # @param source A file name or file object. If a file object is
646 # given, it only has to implement a <b>read(n)</b> method.
647 # @keyparam parser An optional parser instance. If not given, the
648 # standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +0000649 # @return The document root element.
650 # @defreturn Element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000651 # @exception ParseError If the parser fails to parse the document.
Armin Rigo9ed73062005-12-14 18:10:45 +0000652
def parse(self, source, parser=None):
    """Load an external XML document into this element tree.

    *source* is a file name or a file object.  A file object only has to
    implement a ``read(n)`` method.  *parser* is an optional parser
    instance; if it is None, an XMLParser feeding a TreeBuilder is used.

    Returns the document root element, which is also stored as the root
    of this tree.  Per the module documentation, ParseError is raised if
    the parser fails to parse the document.
    """
    close_source = False
    if not hasattr(source, "read"):
        # A file name was given: open it here and close it when done.
        source = open(source, "rb")
        close_source = True
    try:
        # Compare against None rather than testing truthiness, so that a
        # caller-supplied parser which happens to be falsy is still used.
        if parser is None:
            parser = XMLParser(target=TreeBuilder())
        while True:
            data = source.read(65536)
            if not data:
                break
            parser.feed(data)
        self._root = parser.close()
        return self._root
    finally:
        # Only close files opened here; caller-owned objects stay open.
        if close_source:
            source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000671
672 ##
673 # Creates a tree iterator for the root element. The iterator loops
674 # over all elements in this tree, in document order.
675 #
676 # @param tag What tags to look for (default is to return all elements)
677 # @return An iterator.
678 # @defreturn iterator
679
def iter(self, tag=None):
    """Create a tree iterator for the root element.

    *tag* is what tags to look for (the default is to return all
    elements).  The call is delegated to the root element's iter().
    """
    # assert self._root is not None
    root = self._root
    return root.iter(tag)
683
684 # compatibility
def getiterator(self, tag=None):
    """Return ``list(self.iter(tag))``; kept for compatibility.

    Emits a PendingDeprecationWarning on every call.
    """
    # Change for a DeprecationWarning in 1.4
    message = (
        "This method will be removed in future versions. "
        "Use 'tree.iter()' or 'list(tree.iter())' instead."
    )
    warnings.warn(message, PendingDeprecationWarning, stacklevel=2)
    return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000693
694 ##
695 # Finds the first toplevel element with given tag.
696 # Same as getroot().find(path).
697 #
698 # @param path What element to look for.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000699 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000700 # @return The first matching element, or None if no element was found.
701 # @defreturn Element or None
702
def find(self, path, namespaces=None):
    """Find the first matching element; same as getroot().find(path).

    *path* is what element to look for and *namespaces* an optional
    namespace prefix map.  A path starting with "/" is rewritten to start
    with "./" and a FutureWarning is issued.  Returns whatever the root
    element's find() returns.
    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000714
715 ##
716 # Finds the element text for the first toplevel element with given
717 # tag. Same as getroot().findtext(path).
718 #
719 # @param path What toplevel element to look for.
720 # @param default What to return if the element was not found.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000721 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000722 # @return The text content of the first matching element, or the
    # default value if no element was found. Note that if the element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000724 # is found, but has no text content, this method returns an
Armin Rigo9ed73062005-12-14 18:10:45 +0000725 # empty string.
726 # @defreturn string
727
def findtext(self, path, default=None, namespaces=None):
    """Find the text of the first matching element.

    Same as getroot().findtext(path).  *path* is what element to look
    for, *default* what to return if the element was not found, and
    *namespaces* an optional namespace prefix map.  A path starting with
    "/" is rewritten to start with "./" and a FutureWarning is issued.
    Returns whatever the root element's findtext() returns.
    """
    # assert self._root is not None
    starts_at_root = path[:1] == "/"
    if starts_at_root:
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000739
740 ##
741 # Finds all toplevel elements with the given tag.
742 # Same as getroot().findall(path).
743 #
744 # @param path What element to look for.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000745 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000746 # @return A list or iterator containing all matching elements,
747 # in document order.
748 # @defreturn list of Element instances
749
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000750 def findall(self, path, namespaces=None):
751 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000752 if path[:1] == "/":
753 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000754 warnings.warn(
755 "This search is broken in 1.3 and earlier, and will be "
756 "fixed in a future version. If you rely on the current "
757 "behaviour, change it to %r" % path,
758 FutureWarning, stacklevel=2
759 )
760 return self._root.findall(path, namespaces)
761
762 ##
763 # Finds all matching subelements, by tag name or path.
764 # Same as getroot().iterfind(path).
765 #
766 # @param path What element to look for.
767 # @keyparam namespaces Optional namespace prefix map.
768 # @return An iterator or sequence containing all matching elements,
769 # in document order.
770 # @defreturn a generated sequence of Element instances
771
772 def iterfind(self, path, namespaces=None):
773 # assert self._root is not None
774 if path[:1] == "/":
775 path = "." + path
776 warnings.warn(
777 "This search is broken in 1.3 and earlier, and will be "
778 "fixed in a future version. If you rely on the current "
779 "behaviour, change it to %r" % path,
780 FutureWarning, stacklevel=2
781 )
782 return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000783
784 ##
785 # Writes the element tree to a file, as XML.
786 #
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000787 # @def write(file, **options)
Armin Rigo9ed73062005-12-14 18:10:45 +0000788 # @param file A file name, or a file object opened for writing.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000789 # @param **options Options, given as keyword arguments.
Florent Xiclunac17f1722010-08-08 19:48:29 +0000790 # @keyparam encoding Optional output encoding (default is US-ASCII).
791 # Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000792 # @keyparam xml_declaration Controls if an XML declaration should
793 # be added to the file. Use False for never, True for always,
Florent Xiclunac17f1722010-08-08 19:48:29 +0000794 # None for only if not US-ASCII or UTF-8 or Unicode. None is default.
Serhiy Storchaka03530b92013-01-13 21:58:04 +0200795 # @keyparam default_namespace Sets the default XML namespace (for "xmlns").
796 # @keyparam method Optional output method ("xml", "html", "text" or
797 # "c14n"; default is "xml").
Armin Rigo9ed73062005-12-14 18:10:45 +0000798
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000799 def write(self, file_or_filename,
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000800 encoding=None,
801 xml_declaration=None,
802 default_namespace=None,
803 method=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000804 if not method:
805 method = "xml"
806 elif method not in _serialize:
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000807 raise ValueError("unknown method %r" % method)
Florent Xiclunac17f1722010-08-08 19:48:29 +0000808 if not encoding:
809 if method == "c14n":
810 encoding = "utf-8"
811 else:
812 encoding = "us-ascii"
Florent Xiclunac17f1722010-08-08 19:48:29 +0000813 else:
814 encoding = encoding.lower()
Eli Bendersky00f402b2012-07-15 06:02:22 +0300815 with _get_writer(file_or_filename, encoding) as write:
816 if method == "xml" and (xml_declaration or
817 (xml_declaration is None and
818 encoding not in ("utf-8", "us-ascii", "unicode"))):
819 declared_encoding = encoding
820 if encoding == "unicode":
821 # Retrieve the default encoding for the xml declaration
822 import locale
823 declared_encoding = locale.getpreferredencoding()
824 write("<?xml version='1.0' encoding='%s'?>\n" % (
825 declared_encoding,))
826 if method == "text":
827 _serialize_text(write, self._root)
Armin Rigo9ed73062005-12-14 18:10:45 +0000828 else:
Eli Bendersky00f402b2012-07-15 06:02:22 +0300829 qnames, namespaces = _namespaces(self._root, default_namespace)
830 serialize = _serialize[method]
831 serialize(write, self._root, qnames, namespaces)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000832
833 def write_c14n(self, file):
834 # lxml.etree compatibility. use output method instead
835 return self.write(file, method="c14n")
Armin Rigo9ed73062005-12-14 18:10:45 +0000836
837# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000838# serialization support
839
Eli Bendersky00f402b2012-07-15 06:02:22 +0300840@contextlib.contextmanager
841def _get_writer(file_or_filename, encoding):
842 # returns text write method and release all resourses after using
843 try:
844 write = file_or_filename.write
845 except AttributeError:
846 # file_or_filename is a file name
847 if encoding == "unicode":
848 file = open(file_or_filename, "w")
849 else:
850 file = open(file_or_filename, "w", encoding=encoding,
851 errors="xmlcharrefreplace")
852 with file:
853 yield file.write
854 else:
855 # file_or_filename is a file-like object
856 # encoding determines if it is a text or binary writer
857 if encoding == "unicode":
858 # use a text writer as is
859 yield write
860 else:
861 # wrap a binary writer with TextIOWrapper
862 with contextlib.ExitStack() as stack:
863 if isinstance(file_or_filename, io.BufferedIOBase):
864 file = file_or_filename
865 elif isinstance(file_or_filename, io.RawIOBase):
866 file = io.BufferedWriter(file_or_filename)
867 # Keep the original file open when the BufferedWriter is
868 # destroyed
869 stack.callback(file.detach)
870 else:
871 # This is to handle passed objects that aren't in the
872 # IOBase hierarchy, but just have a write method
873 file = io.BufferedIOBase()
874 file.writable = lambda: True
875 file.write = write
876 try:
877 # TextIOWrapper uses this methods to determine
878 # if BOM (for UTF-16, etc) should be added
879 file.seekable = file_or_filename.seekable
880 file.tell = file_or_filename.tell
881 except AttributeError:
882 pass
883 file = io.TextIOWrapper(file,
884 encoding=encoding,
885 errors="xmlcharrefreplace",
886 newline="\n")
887 # Keep the original file open when the TextIOWrapper is
888 # destroyed
889 stack.callback(file.detach)
890 yield file.write
891
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespace prefixes used in a tree.

    Walks ``elem.iter()`` and looks at every tag, attribute key, QName
    attribute value and QName text.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each name to
    its serialized "prefix:local" (or plain) form, *namespaces* maps
    namespace URIs to prefixes.

    Raises ValueError if an unqualified name is found while
    *default_namespace* is set.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    def note(name):
        # register a name the first time it is seen
        if name not in qnames:
            add_qname(name)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            note(tag.text)
        elif isinstance(tag, str):
            note(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            note(key)
            if isinstance(value, QName):
                note(value.text)
        text = node.text
        if isinstance(text, QName):
            note(text.text)
    return qnames, namespaces
952
def _serialize_xml(write, elem, qnames, namespaces):
    """Write *elem* and its subtree as XML through the *write* callable.

    write -- callable taking one string.
    elem -- element to serialize (its tail is written as well).
    qnames -- mapping from tag/attribute names to serialized names.
    namespaces -- uri -> prefix mapping to declare on this element, or
        a false value; children are always serialized with None.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only text and children are written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda x: x[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in sorted(attributes):  # lexical order
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if text or len(elem):
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None)
                write("</" + name + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
999
# Names of HTML elements that are written without a closing tag by
# _serialize_html.  Built directly as a set: the old try/except NameError
# fallback only mattered on interpreters without the set builtin.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
1007
def _serialize_html(write, elem, qnames, namespaces):
    """Write *elem* and its subtree as HTML through the *write* callable.

    write -- callable taking one string.
    elem -- element to serialize (its tail is written as well).
    qnames -- mapping from tag/attribute names to serialized names.
    namespaces -- uri -> prefix mapping to declare on this element, or
        a false value; children are always serialized with None.

    The text of "script" and "style" elements is written unescaped, and
    elements listed in HTML_EMPTY get no closing tag.  Both checks are
    case-insensitive, but the closing tag is written with the same
    spelling as the opening tag so the two always match.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # No tag of its own: only text and children are written.
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            # Lower-case copy used only for the lookups below; the
            # original spelling is kept for the closing tag.
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1057
1058def _serialize_text(write, elem):
1059 for part in elem.itertext():
1060 write(part)
1061 if elem.tail:
1062 write(elem.tail)
1063
# Dispatch table: output method name -> serializer function.
# The optional "c14n" method (_serialize_c14n) is imported at the end
# of the module.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001071
1072##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001073# Registers a namespace prefix. The registry is global, and any
1074# existing mapping for either the given prefix or the namespace URI
1075# will be removed.
Armin Rigo9ed73062005-12-14 18:10:45 +00001076#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001077# @param prefix Namespace prefix.
1078# @param uri Namespace uri. Tags and attributes in this namespace
1079# will be serialized with the given prefix, if at all possible.
1080# @exception ValueError If the prefix is reserved, or is otherwise
1081# invalid.
Armin Rigo9ed73062005-12-14 18:10:45 +00001082
def register_namespace(prefix, uri):
    """Register *prefix* for namespace *uri* in the global registry.

    Any existing entry whose URI equals *uri* or whose prefix equals
    *prefix* is removed first.

    Raises ValueError if *prefix* has the form "ns<digits>", which is
    reserved for prefixes generated during serialization.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted in the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001105
1106def _raise_serialization_error(text):
1107 raise TypeError(
1108 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1109 )
1110
1111def _escape_cdata(text):
1112 # escape character data
1113 try:
1114 # it's worth avoiding do-nothing calls for strings that are
1115 # shorter than 500 character, or so. assume that's, by far,
1116 # the most common case in most applications.
1117 if "&" in text:
1118 text = text.replace("&", "&amp;")
1119 if "<" in text:
1120 text = text.replace("<", "&lt;")
1121 if ">" in text:
1122 text = text.replace(">", "&gt;")
1123 return text
1124 except (TypeError, AttributeError):
1125 _raise_serialization_error(text)
1126
1127def _escape_attrib(text):
1128 # escape attribute value
1129 try:
1130 if "&" in text:
1131 text = text.replace("&", "&amp;")
1132 if "<" in text:
1133 text = text.replace("<", "&lt;")
1134 if ">" in text:
1135 text = text.replace(">", "&gt;")
1136 if "\"" in text:
1137 text = text.replace("\"", "&quot;")
1138 if "\n" in text:
1139 text = text.replace("\n", "&#10;")
1140 return text
1141 except (TypeError, AttributeError):
1142 _raise_serialization_error(text)
1143
1144def _escape_attrib_html(text):
1145 # escape attribute value
1146 try:
1147 if "&" in text:
1148 text = text.replace("&", "&amp;")
1149 if ">" in text:
1150 text = text.replace(">", "&gt;")
1151 if "\"" in text:
1152 text = text.replace("\"", "&quot;")
1153 return text
1154 except (TypeError, AttributeError):
1155 _raise_serialization_error(text)
1156
1157# --------------------------------------------------------------------
1158
1159##
1160# Generates a string representation of an XML element, including all
Florent Xiclunac17f1722010-08-08 19:48:29 +00001161# subelements. If encoding is "unicode", the return type is a string;
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001162# otherwise it is a bytes array.
1163#
1164# @param element An Element instance.
Florent Xiclunac17f1722010-08-08 19:48:29 +00001165# @keyparam encoding Optional output encoding (default is US-ASCII).
1166# Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001167# @keyparam method Optional output method ("xml", "html", "text" or
1168# "c14n"; default is "xml").
1169# @return An (optionally) encoded string containing the XML data.
1170# @defreturn string
1171
def tostring(element, encoding=None, method=None):
    """Serialize *element* and return the result as one string.

    The element is wrapped in an ElementTree and written to an
    in-memory stream: a StringIO when *encoding* is 'unicode' (a str is
    returned), otherwise a BytesIO (bytes are returned).
    """
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001176
1177##
1178# Generates a string representation of an XML element, including all
Eli Bendersky00f402b2012-07-15 06:02:22 +03001179# subelements.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001180#
1181# @param element An Element instance.
1182# @keyparam encoding Optional output encoding (default is US-ASCII).
Florent Xiclunac17f1722010-08-08 19:48:29 +00001183# Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001184# @keyparam method Optional output method ("xml", "html", "text" or
1185# "c14n"; default is "xml").
1186# @return A sequence object containing the XML data.
1187# @defreturn sequence
1188# @since 1.3
1189
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001190class _ListDataStream(io.BufferedIOBase):
1191 """ An auxiliary stream accumulating into a list reference
1192 """
1193 def __init__(self, lst):
1194 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001195
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001196 def writable(self):
1197 return True
1198
1199 def seekable(self):
1200 return True
1201
1202 def write(self, b):
1203 self.lst.append(b)
1204
1205 def tell(self):
1206 return len(self.lst)
1207
def tostringlist(element, encoding=None, method=None):
    """Serialize *element* and return the output as a list of chunks.

    The element is wrapped in an ElementTree and written to a
    _ListDataStream, which appends each chunk it receives to the
    returned list.
    """
    chunks = []
    tree = ElementTree(element)
    tree.write(_ListDataStream(chunks), encoding, method=method)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001213
1214##
1215# Writes an element tree or element structure to sys.stdout. This
1216# function should be used for debugging only.
1217# <p>
1218# The exact output format is implementation dependent. In this
1219# version, it's written as an ordinary XML file.
1220#
1221# @param elem An element tree or an individual element.
1222
def dump(elem):
    """Write an element tree or element to sys.stdout (debugging aid).

    A plain element is wrapped in an ElementTree first.  The output is
    written with the "unicode" encoding, and a newline is added unless
    the root's tail already ends with one.
    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1231
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001232# --------------------------------------------------------------------
1233# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001234
1235##
1236# Parses an XML document into an element tree.
1237#
1238# @param source A filename or file object containing XML data.
1239# @param parser An optional parser instance. If not given, the
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001240# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001241# @return An ElementTree instance
1242
def parse(source, parser=None):
    """Parse *source* into a new ElementTree and return the tree.

    Both arguments are handed unchanged to ``ElementTree.parse``.
    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1247
1248##
1249# Parses an XML document into an element tree incrementally, and reports
1250# what's going on to the user.
1251#
1252# @param source A filename or file object containing XML data.
1253# @param events A list of events to report back. If omitted, only "end"
1254# events are reported.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001255# @param parser An optional parser instance. If not given, the
1256# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001257# @return A (event, elem) iterator.
1258
def iterparse(source, events=None, parser=None):
    """Return an iterator of (event, elem) pairs parsed from *source*.

    source -- an object with a ``read`` attribute, or a file name that
        is opened here in binary mode; only a file opened here is
        flagged for closing by the iterator.
    events -- events to report, passed on to _IterParseIterator.
    parser -- parser instance; a false value selects
        ``XMLParser(target=TreeBuilder())``.
    """
    close_source = not hasattr(source, "read")
    if close_source:
        source = open(source, "rb")
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    return _IterParseIterator(source, events, parser, close_source)
Armin Rigo9ed73062005-12-14 18:10:45 +00001267
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001268class _IterParseIterator:
1269
Antoine Pitroue033e062010-10-29 10:38:18 +00001270 def __init__(self, source, events, parser, close_source=False):
Armin Rigo9ed73062005-12-14 18:10:45 +00001271 self._file = source
Antoine Pitroue033e062010-10-29 10:38:18 +00001272 self._close_file = close_source
Armin Rigo9ed73062005-12-14 18:10:45 +00001273 self._events = []
1274 self._index = 0
Florent Xicluna91d51932011-11-01 23:31:09 +01001275 self._error = None
Armin Rigo9ed73062005-12-14 18:10:45 +00001276 self.root = self._root = None
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001277 self._parser = parser
Armin Rigo9ed73062005-12-14 18:10:45 +00001278 # wire up the parser for event reporting
1279 parser = self._parser._parser
1280 append = self._events.append
1281 if events is None:
1282 events = ["end"]
1283 for event in events:
1284 if event == "start":
1285 try:
1286 parser.ordered_attributes = 1
1287 parser.specified_attributes = 1
1288 def handler(tag, attrib_in, event=event, append=append,
1289 start=self._parser._start_list):
1290 append((event, start(tag, attrib_in)))
1291 parser.StartElementHandler = handler
1292 except AttributeError:
1293 def handler(tag, attrib_in, event=event, append=append,
1294 start=self._parser._start):
1295 append((event, start(tag, attrib_in)))
1296 parser.StartElementHandler = handler
1297 elif event == "end":
1298 def handler(tag, event=event, append=append,
1299 end=self._parser._end):
1300 append((event, end(tag)))
1301 parser.EndElementHandler = handler
1302 elif event == "start-ns":
1303 def handler(prefix, uri, event=event, append=append):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001304 append((event, (prefix or "", uri or "")))
Armin Rigo9ed73062005-12-14 18:10:45 +00001305 parser.StartNamespaceDeclHandler = handler
1306 elif event == "end-ns":
1307 def handler(prefix, event=event, append=append):
1308 append((event, None))
1309 parser.EndNamespaceDeclHandler = handler
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001310 else:
1311 raise ValueError("unknown event %r" % event)
Armin Rigo9ed73062005-12-14 18:10:45 +00001312
Georg Brandla18af4e2007-04-21 15:47:16 +00001313 def __next__(self):
Armin Rigo9ed73062005-12-14 18:10:45 +00001314 while 1:
1315 try:
1316 item = self._events[self._index]
Florent Xicluna91d51932011-11-01 23:31:09 +01001317 self._index += 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001318 return item
Florent Xicluna91d51932011-11-01 23:31:09 +01001319 except IndexError:
1320 pass
1321 if self._error:
1322 e = self._error
1323 self._error = None
1324 raise e
1325 if self._parser is None:
1326 self.root = self._root
1327 if self._close_file:
1328 self._file.close()
1329 raise StopIteration
1330 # load event buffer
1331 del self._events[:]
1332 self._index = 0
1333 data = self._file.read(16384)
1334 if data:
1335 try:
1336 self._parser.feed(data)
1337 except SyntaxError as exc:
1338 self._error = exc
1339 else:
1340 self._root = self._parser.close()
1341 self._parser = None
Armin Rigo9ed73062005-12-14 18:10:45 +00001342
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001343 def __iter__(self):
1344 return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001345
1346##
1347# Parses an XML document from a string constant. This function can
1348# be used to embed "XML literals" in Python code.
1349#
1350# @param source A string containing XML data.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001351# @param parser An optional parser instance. If not given, the
1352# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001353# @return An Element instance.
1354# @defreturn Element
1355
def XML(text, parser=None):
    """Parse an XML document held in *text* and return the parser's result.

    text -- the XML data, fed to the parser in one call.
    parser -- parser instance; a false value selects
        ``XMLParser(target=TreeBuilder())``.

    Returns whatever the parser's ``close()`` returns.
    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    return active.close()
1361
1362##
1363# Parses an XML document from a string constant, and also returns
1364# a dictionary which maps from element id:s to elements.
1365#
1366# @param source A string containing XML data.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001367# @param parser An optional parser instance. If not given, the
1368# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001369# @return A tuple containing an Element instance and a dictionary.
1370# @defreturn (Element, dictionary)
1371
def XMLID(text, parser=None):
    """Parse *text* and also collect the elements that carry an "id".

    text -- the XML data, fed to the parser in one call.
    parser -- parser instance; a false value selects
        ``XMLParser(target=TreeBuilder())``.

    Returns ``(tree, ids)``: the parser's ``close()`` result and a dict
    mapping each non-empty "id" attribute value to its element.  When an
    id occurs more than once, the last element in ``tree.iter()`` order
    wins.
    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    active.feed(text)
    tree = active.close()
    ids = {}
    for node in tree.iter():
        identifier = node.get("id")
        if identifier:
            ids[identifier] = node
    return tree, ids
1383
1384##
1385# Parses an XML document from a string constant. Same as {@link #XML}.
1386#
1387# @def fromstring(text)
1388# @param source A string containing XML data.
1389# @return An Element instance.
1390# @defreturn Element
1391
# Public alias: fromstring is bound to the same function object as XML.
fromstring = XML
1393
1394##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001395# Parses an XML document from a sequence of string fragments.
Armin Rigo9ed73062005-12-14 18:10:45 +00001396#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001397# @param sequence A list or other sequence containing XML data fragments.
1398# @param parser An optional parser instance. If not given, the
1399# standard {@link XMLParser} parser is used.
1400# @return An Element instance.
1401# @defreturn Element
1402# @since 1.3
Armin Rigo9ed73062005-12-14 18:10:45 +00001403
def fromstringlist(sequence, parser=None):
    """Parse an XML document supplied as a sequence of fragments.

    sequence -- iterable of XML data fragments, fed to the parser one
        at a time in order.
    parser -- parser instance; a false value selects
        ``XMLParser(target=TreeBuilder())``.

    Returns whatever the parser's ``close()`` returns.
    """
    active = parser if parser else XMLParser(target=TreeBuilder())
    feed = active.feed
    for fragment in sequence:
        feed(fragment)
    return active.close()
1410
1411# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001412
1413##
1414# Generic element structure builder. This builder converts a sequence
1415# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1416# #TreeBuilder.end} method calls to a well-formed element structure.
1417# <p>
1418# You can use this class to build an element structure using a custom XML
1419# parser, or a parser for some other XML-like format.
1420#
1421# @param element_factory Optional element factory. This factory
1422# is called to create new Element instances, as necessary.
1423
class TreeBuilder:
    """Generic element structure builder.

    Converts a sequence of start(), data() and end() calls into an
    element structure.  *element_factory*, when given, is called as
    ``element_factory(tag, attrs)`` to create each new element; it
    defaults to Element.
    """

    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Return the toplevel element.

        Asserts that no element is still open and that at least one
        element was seen.
        """
        assert len(self._elem) == 0, "missing end tags"
        root = self._last
        assert root is not None, "missing toplevel element"
        return root

    def _flush(self):
        # Attach the collected character data to the last element: as
        # its tail when we are past its end tag, as its text otherwise.
        if not self._data:
            return
        target = self._last
        if target is not None:
            text = "".join(self._data)
            if self._tail:
                assert target.tail is None, "internal error (tail)"
                target.tail = text
            else:
                assert target.text is None, "internal error (text)"
                target.text = text
        self._data = []

    def data(self, data):
        """Add text to the current element.

        The string is only collected here; it is attached to an element
        on the next start() or end() call.
        """
        self._data.append(data)

    def start(self, tag, attrs):
        """Open a new element and return it.

        tag -- the element name.
        attrs -- the element attributes, handed to the factory.
        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        stack = self._elem
        if stack:
            stack[-1].append(elem)
        stack.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close the current element and return it.

        Asserts that *tag* matches the tag of the element being closed.
        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   closed.tag, tag)
        self._tail = 1
        return closed
1500
1501##
1502# Element structure builder for XML source data, based on the
1503# <b>expat</b> parser.
1504#
1505# @keyparam target Target object. If omitted, the builder uses an
1506# instance of the standard {@link #TreeBuilder} class.
1507# @keyparam html Predefine HTML entities. This flag is not supported
1508# by the current implementation.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001509# @keyparam encoding Optional encoding. If given, the value overrides
1510# the encoding specified in the XML file.
Armin Rigo9ed73062005-12-14 18:10:45 +00001511# @see #ElementTree
1512# @see #TreeBuilder
1513
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001514class XMLParser:
Armin Rigo9ed73062005-12-14 18:10:45 +00001515
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001516 def __init__(self, html=0, target=None, encoding=None):
Armin Rigo9ed73062005-12-14 18:10:45 +00001517 try:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001518 from xml.parsers import expat
Armin Rigo9ed73062005-12-14 18:10:45 +00001519 except ImportError:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001520 try:
1521 import pyexpat as expat
1522 except ImportError:
1523 raise ImportError(
1524 "No module named expat; use SimpleXMLTreeBuilder instead"
1525 )
1526 parser = expat.ParserCreate(encoding, "}")
Armin Rigo9ed73062005-12-14 18:10:45 +00001527 if target is None:
1528 target = TreeBuilder()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001529 # underscored names are provided for compatibility only
1530 self.parser = self._parser = parser
1531 self.target = self._target = target
1532 self._error = expat.error
Armin Rigo9ed73062005-12-14 18:10:45 +00001533 self._names = {} # name memo cache
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001534 # main callbacks
Armin Rigo9ed73062005-12-14 18:10:45 +00001535 parser.DefaultHandlerExpand = self._default
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001536 if hasattr(target, 'start'):
1537 parser.StartElementHandler = self._start
1538 if hasattr(target, 'end'):
1539 parser.EndElementHandler = self._end
1540 if hasattr(target, 'data'):
1541 parser.CharacterDataHandler = target.data
1542 # miscellaneous callbacks
1543 if hasattr(target, 'comment'):
1544 parser.CommentHandler = target.comment
1545 if hasattr(target, 'pi'):
1546 parser.ProcessingInstructionHandler = target.pi
Armin Rigo9ed73062005-12-14 18:10:45 +00001547 # let expat do the buffering, if supported
1548 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001549 parser.buffer_text = 1
Armin Rigo9ed73062005-12-14 18:10:45 +00001550 except AttributeError:
1551 pass
1552 # use new-style attribute handling, if supported
1553 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001554 parser.ordered_attributes = 1
1555 parser.specified_attributes = 1
1556 if hasattr(target, 'start'):
1557 parser.StartElementHandler = self._start_list
Armin Rigo9ed73062005-12-14 18:10:45 +00001558 except AttributeError:
1559 pass
Armin Rigo9ed73062005-12-14 18:10:45 +00001560 self._doctype = None
1561 self.entity = {}
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001562 try:
1563 self.version = "Expat %d.%d.%d" % expat.version_info
1564 except AttributeError:
1565 pass # unknown
1566
def _raiseerror(self, value):
    """Raise a ParseError built from the expat error *value*.

    The new exception carries value.code as ``code`` and the pair
    (value.lineno, value.offset) as ``position``.
    """
    error = ParseError(value)
    error.code, error.position = value.code, (value.lineno, value.offset)
    raise error
Armin Rigo9ed73062005-12-14 18:10:45 +00001572
Armin Rigo9ed73062005-12-14 18:10:45 +00001573 def _fixname(self, key):
1574 # expand qname, and convert name string to ascii, if possible
1575 try:
1576 name = self._names[key]
1577 except KeyError:
1578 name = key
1579 if "}" in name:
1580 name = "{" + name
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001581 self._names[key] = name
Armin Rigo9ed73062005-12-14 18:10:45 +00001582 return name
1583
1584 def _start(self, tag, attrib_in):
1585 fixname = self._fixname
1586 tag = fixname(tag)
1587 attrib = {}
1588 for key, value in attrib_in.items():
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001589 attrib[fixname(key)] = value
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001590 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001591
1592 def _start_list(self, tag, attrib_in):
1593 fixname = self._fixname
1594 tag = fixname(tag)
1595 attrib = {}
1596 if attrib_in:
1597 for i in range(0, len(attrib_in), 2):
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001598 attrib[fixname(attrib_in[i])] = attrib_in[i+1]
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001599 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001600
Armin Rigo9ed73062005-12-14 18:10:45 +00001601 def _end(self, tag):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001602 return self.target.end(self._fixname(tag))
1603
Armin Rigo9ed73062005-12-14 18:10:45 +00001604 def _default(self, text):
1605 prefix = text[:1]
1606 if prefix == "&":
1607 # deal with undefined entities
1608 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001609 data_handler = self.target.data
1610 except AttributeError:
1611 return
1612 try:
1613 data_handler(self.entity[text[1:-1]])
Armin Rigo9ed73062005-12-14 18:10:45 +00001614 except KeyError:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001615 from xml.parsers import expat
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001616 err = expat.error(
Armin Rigo9ed73062005-12-14 18:10:45 +00001617 "undefined entity %s: line %d, column %d" %
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001618 (text, self.parser.ErrorLineNumber,
1619 self.parser.ErrorColumnNumber)
Armin Rigo9ed73062005-12-14 18:10:45 +00001620 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001621 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001622 err.lineno = self.parser.ErrorLineNumber
1623 err.offset = self.parser.ErrorColumnNumber
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001624 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001625 elif prefix == "<" and text[:9] == "<!DOCTYPE":
1626 self._doctype = [] # inside a doctype declaration
1627 elif self._doctype is not None:
1628 # parse doctype contents
1629 if prefix == ">":
1630 self._doctype = None
1631 return
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001632 text = text.strip()
Armin Rigo9ed73062005-12-14 18:10:45 +00001633 if not text:
1634 return
1635 self._doctype.append(text)
1636 n = len(self._doctype)
1637 if n > 2:
1638 type = self._doctype[1]
1639 if type == "PUBLIC" and n == 4:
1640 name, type, pubid, system = self._doctype
Florent Xiclunaa1c974a2012-07-07 13:16:44 +02001641 if pubid:
1642 pubid = pubid[1:-1]
Armin Rigo9ed73062005-12-14 18:10:45 +00001643 elif type == "SYSTEM" and n == 3:
1644 name, type, system = self._doctype
1645 pubid = None
1646 else:
1647 return
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001648 if hasattr(self.target, "doctype"):
1649 self.target.doctype(name, pubid, system[1:-1])
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001650 elif self.doctype != self._XMLParser__doctype:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001651 # warn about deprecated call
1652 self._XMLParser__doctype(name, pubid, system[1:-1])
1653 self.doctype(name, pubid, system[1:-1])
Armin Rigo9ed73062005-12-14 18:10:45 +00001654 self._doctype = None
1655
##
# (Deprecated) Handles a doctype declaration.
#
# @param name Doctype name.
# @param pubid Public identifier.
# @param system System identifier.
1662
def doctype(self, name, pubid, system):
    """Deprecated doctype hook; only issues a DeprecationWarning.

    The arguments are ignored.  Define doctype() on the TreeBuilder
    target instead.
    """
    message = (
        "This method of XMLParser is deprecated. Define doctype() "
        "method on the TreeBuilder target."
    )
    warnings.warn(message, category=DeprecationWarning)
1670
# sentinel: a private reference to the base-class doctype(); _default()
# compares self.doctype against it to detect that doctype was redefined
# in a subclass
__doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001673
##
# Feeds data to the parser.
#
# @param data Encoded data.
1678
def feed(self, data):
    """Pass a chunk of *data* to the expat parser (not the final chunk).

    An expat error (self._error) is handed to _raiseerror(), which
    re-raises it as a ParseError.
    """
    try:
        self.parser.Parse(data, 0)
    except self._error as expat_error:
        self._raiseerror(expat_error)
Armin Rigo9ed73062005-12-14 18:10:45 +00001684
##
# Finishes feeding data to the parser.
#
# @return An element structure.
# @defreturn Element
1690
def close(self):
    """Signal end of data and return whatever the target's close() returns.

    An expat error raised by the final Parse() call is re-raised through
    _raiseerror().  When the target has no close() method, None is
    returned.  Once the final Parse() has succeeded, the parser and
    target references are deleted from this object, whether or not the
    target's close() raises.
    """
    try:
        self.parser.Parse("", 1)  # end of data
    except self._error as expat_error:
        self._raiseerror(expat_error)
    try:
        try:
            finish = self.target.close
        except AttributeError:
            return None
        return finish()
    finally:
        # get rid of circular references
        del self.parser, self._parser
        del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001706
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001707
1708# Import the C accelerators
1709try:
1710 # Element, SubElement, ParseError, TreeBuilder, XMLParser
1711 from _elementtree import *
1712except ImportError:
1713 pass
1714else:
1715 # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser
1716
class ElementTree(ElementTree):
    """ElementTree whose parse() defaults to the accelerated XMLParser."""

    def parse(self, source, parser=None):
        """Load XML from *source* into this tree and return the root.

        source -- a file name, or an object with a read() method.  A
                  file name is opened in binary mode and closed again
                  before returning, even on error.
        parser -- optional parser; when given, data is read in 64 KiB
                  chunks and passed to parser.feed(), then parser.close()
                  supplies the root.  When omitted, a new XMLParser is
                  created and its _parse() method reads the source itself.
        """
        opened_here = not hasattr(source, 'read')
        if opened_here:
            source = open(source, 'rb')
        try:
            if parser is None:
                parser = XMLParser()
                self._root = parser._parse(source)
            else:
                chunk = source.read(65536)
                while chunk:
                    parser.feed(chunk)
                    chunk = source.read(65536)
                self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()
1738
class iterparse:
    """Parses an XML section into an element tree incrementally.

    Reports what's going on to the user. 'source' is a filename or file
    object containing XML data. 'events' is a list of events to report back.
    The supported events are the strings "start", "end", "start-ns" and
    "end-ns" (the "ns" events are used to get detailed namespace
    information). If 'events' is omitted, only "end" events are reported.
    'parser' is an optional parser instance. If not given, the standard
    XMLParser parser is used. Returns an iterator providing
    (event, elem) pairs.

    A file opened here (because a filename was given) is closed when
    iteration finishes, when parser setup fails, and when reading or
    parsing raises.  After an error has been raised the iterator is
    exhausted.  The 'root' attribute is set once iteration has completed.
    """

    root = None

    def __init__(self, file, events=None, parser=None):
        self._close_file = False
        if not hasattr(file, 'read'):
            file = open(file, 'rb')
            self._close_file = True
        self._file = file
        self._events = []
        self._index = 0
        self._error = None
        self.root = self._root = None
        try:
            if parser is None:
                parser = XMLParser(target=TreeBuilder())
            self._parser = parser
            # the parser appends (event, elem) pairs to self._events
            self._parser._setevents(self._events, events)
        except BaseException:
            # do not leak a file we opened when setup fails
            self._close_source()
            raise

    def _close_source(self):
        # Close the underlying file only if this object opened it.
        if self._close_file:
            self._close_file = False
            self._file.close()

    def __next__(self):
        while True:
            try:
                item = self._events[self._index]
            except IndexError:
                pass
            else:
                self._index += 1
                return item
            if self._error:
                # events queued before the failure have been delivered;
                # now report the failure and end the iteration
                e = self._error
                self._error = None
                self._parser = None
                self._close_source()
                raise e
            if self._parser is None:
                self.root = self._root
                self._close_source()
                raise StopIteration
            # load event buffer
            del self._events[:]
            self._index = 0
            try:
                data = self._file.read(16384)
                if data:
                    try:
                        self._parser.feed(data)
                    except SyntaxError as exc:
                        # deferred so already-queued events come out first
                        self._error = exc
                else:
                    self._root = self._parser.close()
                    self._parser = None
            except BaseException:
                # read(), feed() or close() failed: release the file and
                # make further next() calls stop instead of re-reading
                self._parser = None
                self._close_source()
                raise

    def __iter__(self):
        return self
1800
# compatibility: XMLTreeBuilder is kept as an alias of XMLParser
XMLTreeBuilder = XMLParser

# workaround circular import: the "c14n" serializer is registered here, at
# the end of the module, and is simply left out when ElementC14N cannot be
# imported.
try:
    from ElementC14N import _serialize_c14n
    _serialize["c14n"] = _serialize_c14n
except ImportError:
    pass