blob: aa5e346a694b0c2d31627801bc081abd5e98d04e [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
Florent Xiclunaf15351d2010-03-13 23:24:31 +00003# $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $
Armin Rigo9ed73062005-12-14 18:10:45 +00004#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00005# light-weight XML support for Python 2.3 and later.
Armin Rigo9ed73062005-12-14 18:10:45 +00006#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00007# history (since 1.2.6):
8# 2005-11-12 fl added tostringlist/fromstringlist helpers
9# 2006-07-05 fl merged in selected changes from the 1.3 sandbox
10# 2006-07-05 fl removed support for 2.1 and earlier
11# 2007-06-21 fl added deprecation/future warnings
12# 2007-08-25 fl added doctype hook, added parser version attribute etc
13# 2007-08-26 fl added new serializer code (better namespace handling, etc)
14# 2007-08-27 fl warn for broken /tag searches on tree level
15# 2007-09-02 fl added html/text methods to serializer (experimental)
16# 2007-09-05 fl added method argument to tostring/tostringlist
17# 2007-09-06 fl improved error handling
18# 2007-09-13 fl added itertext, iterfind; assorted cleanups
19# 2007-12-15 fl added C14N hooks, copy method (experimental)
Armin Rigo9ed73062005-12-14 18:10:45 +000020#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000021# Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
Armin Rigo9ed73062005-12-14 18:10:45 +000022#
23# fredrik@pythonware.com
24# http://www.pythonware.com
25#
26# --------------------------------------------------------------------
27# The ElementTree toolkit is
28#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000029# Copyright (c) 1999-2008 by Fredrik Lundh
Armin Rigo9ed73062005-12-14 18:10:45 +000030#
31# By obtaining, using, and/or copying this software and/or its
32# associated documentation, you agree that you have read, understood,
33# and will comply with the following terms and conditions:
34#
35# Permission to use, copy, modify, and distribute this software and
36# its associated documentation for any purpose and without fee is
37# hereby granted, provided that the above copyright notice appears in
38# all copies, and that both that copyright notice and this permission
39# notice appear in supporting documentation, and that the name of
40# Secret Labs AB or the author not be used in advertising or publicity
41# pertaining to distribution of the software without specific, written
42# prior permission.
43#
44# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
45# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
46# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
47# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
48# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
49# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
50# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
51# OF THIS SOFTWARE.
52# --------------------------------------------------------------------
53
Fredrik Lundh63168a52005-12-14 22:29:34 +000054# Licensed to PSF under a Contributor Agreement.
Florent Xiclunaf15351d2010-03-13 23:24:31 +000055# See http://www.python.org/psf/license for licensing details.
Fredrik Lundh63168a52005-12-14 22:29:34 +000056
Armin Rigo9ed73062005-12-14 18:10:45 +000057__all__ = [
58 # public symbols
59 "Comment",
60 "dump",
61 "Element", "ElementTree",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000062 "fromstring", "fromstringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000063 "iselement", "iterparse",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000064 "parse", "ParseError",
Armin Rigo9ed73062005-12-14 18:10:45 +000065 "PI", "ProcessingInstruction",
66 "QName",
67 "SubElement",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000068 "tostring", "tostringlist",
Armin Rigo9ed73062005-12-14 18:10:45 +000069 "TreeBuilder",
Florent Xiclunaf15351d2010-03-13 23:24:31 +000070 "VERSION",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010071 "XML", "XMLID",
Thomas Wouters0e3f5912006-08-11 14:57:12 +000072 "XMLParser", "XMLTreeBuilder",
Florent Xiclunaa72a98f2012-02-13 11:03:30 +010073 "register_namespace",
Armin Rigo9ed73062005-12-14 18:10:45 +000074 ]
75
Florent Xiclunaf15351d2010-03-13 23:24:31 +000076VERSION = "1.3.0"
77
Armin Rigo9ed73062005-12-14 18:10:45 +000078##
79# The <b>Element</b> type is a flexible container object, designed to
80# store hierarchical data structures in memory. The type can be
81# described as a cross between a list and a dictionary.
82# <p>
83# Each element has a number of properties associated with it:
84# <ul>
85# <li>a <i>tag</i>. This is a string identifying what kind of data
86# this element represents (the element type, in other words).</li>
87# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
88# <li>a <i>text</i> string.</li>
89# <li>an optional <i>tail</i> string.</li>
90# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
91# </ul>
92#
Florent Xiclunaf15351d2010-03-13 23:24:31 +000093# To create an element instance, use the {@link #Element} constructor
94# or the {@link #SubElement} factory function.
Armin Rigo9ed73062005-12-14 18:10:45 +000095# <p>
96# The {@link #ElementTree} class can be used to wrap an element
97# structure, and convert it from and to XML.
98##
99
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000100import sys
101import re
102import warnings
Eli Bendersky00f402b2012-07-15 06:02:22 +0300103import io
104import contextlib
Armin Rigo9ed73062005-12-14 18:10:45 +0000105
Eli Bendersky27cbb192012-06-15 09:03:19 +0300106from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000107
Armin Rigo9ed73062005-12-14 18:10:45 +0000108
109##
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000110# Parser error. This is a subclass of <b>SyntaxError</b>.
Armin Rigo9ed73062005-12-14 18:10:45 +0000111# <p>
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000112# In addition to the exception value, an exception instance contains a
113# specific exception code in the <b>code</b> attribute, and the line and
114# column of the error in the <b>position</b> attribute.
115
class ParseError(SyntaxError):
    """Parser error; a subclass of SyntaxError.

    The class adds nothing to SyntaxError itself.  The surrounding module
    notes say instances carry a ``code`` attribute (a specific exception
    code) and a ``position`` attribute (line and column of the error);
    those are not set here -- presumably the parser attaches them before
    raising (TODO confirm against the parser code).
    """
118
119# --------------------------------------------------------------------
120
121##
122# Checks if an object appears to be a valid element object.
Armin Rigo9ed73062005-12-14 18:10:45 +0000123#
# @param element An element instance.
125# @return A true value if this is an element object.
126# @defreturn flag
127
def iselement(element):
    """Return True if *element* appears to be a valid element object.

    The check is deliberately duck-typed: anything exposing a ``tag``
    attribute is accepted.
    """
    # FIXME: not sure about this;
    # isinstance(element, Element) or look for tag/attrib/text attributes
    try:
        element.tag
    except AttributeError:
        return False
    return True
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000132
133##
134# Element class. This class defines the Element interface, and
135# provides a reference implementation of this interface.
136# <p>
137# The element name, attribute names, and attribute values can be
138# either ASCII strings (ordinary Python strings containing only 7-bit
139# ASCII characters) or Unicode strings.
140#
141# @param tag The element name.
142# @param attrib An optional dictionary, containing element attributes.
143# @param **extra Additional attributes, given as keyword arguments.
Armin Rigo9ed73062005-12-14 18:10:45 +0000144# @see Element
145# @see SubElement
146# @see Comment
147# @see ProcessingInstruction
148
class Element:
    """An XML element: a cross between a list and a dictionary.

    In document terms an element is laid out as::

        <tag attrib>text<child/>...</tag>tail

    The constructor takes the element name (*tag*), an optional dictionary
    of attributes (*attrib*) and additional attributes given as keyword
    arguments (*extra*).  It raises TypeError if *attrib* is not a dict.
    """

    # (Attribute) Element tag.
    tag = None

    # (Attribute) Element attribute dictionary.  Where possible, use
    # get(), set(), keys() and items() to access element attributes.
    attrib = None

    # (Attribute) Text before the first subelement.  This is either a
    # string or None.  If there was no text, this may be either None or
    # an empty string, depending on the parser.
    text = None

    # (Attribute) Text after this element's end tag, but before the next
    # sibling element's start tag.  This is either a string or None.  If
    # there was no text, this may be either None or an empty string,
    # depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        # Work on a copy so that neither the caller's dictionary nor the
        # shared default argument is ever mutated.
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element object of the same type as this element.

        *tag* is the element tag and *attrib* a dictionary of element
        attributes.  Returns the new element instance.
        """
        return self.__class__(tag, attrib)

    def copy(self):
        """(Experimental) Return a shallow copy of this element.

        Tag, attributes, text and tail are copied; the subelements are
        shared with the original tree.
        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of subelements.

        Only full elements are counted; to check whether an element has
        any content, test both the length and the ``text`` attribute.
        """
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement at *index* (an integer or a slice).

        Raises IndexError if the given element does not exist.
        """
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement(s) at *index* with *element*.

        Raises IndexError if the given element does not exist.
        """
        # Unlike append()/insert()/extend(), no type check is done here.
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement(s) at *index*.

        Raises IndexError if the given element does not exist.
        """
        del self._children[index]

    def append(self, element):
        """Add *element* to the end of this element's subelements.

        In document order the new element appears after the last existing
        subelement (or directly after the text, if it is the first
        subelement), but before this element's end tag.  Raises TypeError
        if *element* is not an Element.
        """
        self._assert_is_element(element)
        self._children.append(element)

    def extend(self, elements):
        """Append every subelement from the iterable *elements*.

        Every item is validated before any is added, so a TypeError leaves
        this element unchanged.
        """
        # Materialize first: validating a generator (or any other one-shot
        # iterator) would otherwise consume it and leave nothing to append.
        # Taking a snapshot also makes ``elem.extend(elem)`` well defined.
        elements = list(elements)
        for element in elements:
            self._assert_is_element(element)
        self._children.extend(elements)

    def insert(self, index, element):
        """Insert *element* as a subelement at position *index*.

        Raises TypeError if *element* is not an Element.
        """
        self._assert_is_element(element)
        self._children.insert(index, element)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, element):
        """Remove the subelement matching *element*.

        The removal is delegated to ``list.remove`` on the child list.  To
        remove subelements by other means, the easiest way is often a list
        comprehension selecting the elements to keep, followed by slice
        assignment on the parent.  Raises ValueError if no matching
        element could be found.
        """
        self._children.remove(element)

    def getchildren(self):
        """(Deprecated) Return all subelements, in document order.

        Emits a DeprecationWarning.  The returned list is the element's
        internal child list, not a copy.
        """
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Find the first matching subelement, by tag name or path.

        *path* says what element to look for and *namespaces* is an
        optional namespace prefix map.  Returns the first matching
        element, or None if no element was found.
        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find the text of the first matching subelement.

        *path* says what element to look for, *default* is returned when
        no element was found and *namespaces* is an optional namespace
        prefix map.  If the element is found but has no text content, an
        empty string is returned.
        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements, by tag name or path.

        Returns a list or other sequence containing all matching
        elements, in document order.
        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements, by tag name or path.

        Returns an iterator or sequence containing all matching elements,
        in document order.
        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset this element.

        Removes all subelements, clears all attributes and sets the
        ``text`` and ``tail`` attributes to None.
        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Return the value of attribute *key*, or *default* if missing.

        Equivalent to ``attrib.get(key, default)``.
        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute *key* to *value*.

        Equivalent to ``attrib[key] = value``.
        """
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names.

        Equivalent to ``attrib.keys()``.
        """
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs.

        Equivalent to ``attrib.items()``.
        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create a tree iterator.

        The iterator loops over this element and all subelements, in
        document order, and yields every element whose tag equals *tag*.
        With *tag* None or "*", all elements are yielded.

        If the tree structure is modified during iteration, new or
        removed elements may or may not be included.  To get a stable
        set, call list() on the iterator and loop over the result.
        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """Return ``list(self.iter(tag))``; kept for compatibility.

        Emits a PendingDeprecationWarning.
        """
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
        )
        return list(self.iter(tag))

    def itertext(self):
        """Create a text iterator.

        The iterator loops over this element and all subelements, in
        document order, and yields all inner text.  An element whose tag
        is neither a string nor None yields nothing at all.
        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        if self.text:
            yield self.text
        for e in self:
            yield from e.itertext()
            if e.tail:
                yield e.tail


# compatibility
_Element = _ElementInterface = Element
Armin Rigo9ed73062005-12-14 18:10:45 +0000497
498##
499# Subelement factory. This function creates an element instance, and
500# appends it to an existing element.
501# <p>
502# The element name, attribute names, and attribute values can be
503# either 8-bit ASCII strings or Unicode strings.
504#
505# @param parent The parent element.
506# @param tag The subelement name.
507# @param attrib An optional dictionary, containing element attributes.
508# @param **extra Additional attributes, given as keyword arguments.
509# @return An element instance.
510# @defreturn Element
511
def SubElement(parent, tag, attrib={}, **extra):
    """Create a subelement and append it to *parent*.

    *tag* is the subelement name, *attrib* an optional dictionary of
    attributes and *extra* additional attributes given as keyword
    arguments.  The new element is built with ``parent.makeelement`` and
    returned after being appended to *parent*.
    """
    # Merge into a copy so the caller's dictionary (and the shared
    # default) stays untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
518
519##
520# Comment element factory. This factory function creates a special
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000521# element that will be serialized as an XML comment by the standard
522# serializer.
Armin Rigo9ed73062005-12-14 18:10:45 +0000523# <p>
524# The comment string can be either an 8-bit ASCII string or a Unicode
525# string.
526#
527# @param text A string containing the comment string.
528# @return An element instance, representing a comment.
529# @defreturn Element
530
def Comment(text=None):
    """Create a comment element.

    The element's tag is the ``Comment`` function itself and its
    ``text`` is set to *text*, the comment string.  Returns the new
    element instance.
    """
    node = Element(Comment)
    node.text = text
    return node
535
536##
537# PI element factory. This factory function creates a special element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000538# that will be serialized as an XML processing instruction by the standard
539# serializer.
Armin Rigo9ed73062005-12-14 18:10:45 +0000540#
541# @param target A string containing the PI target.
542# @param text A string containing the PI contents, if any.
543# @return An element instance, representing a PI.
544# @defreturn Element
545
def ProcessingInstruction(target, text=None):
    """Create a processing-instruction element.

    The element's tag is the ``ProcessingInstruction`` function itself.
    Its ``text`` is *target*, followed by a space and *text* when *text*
    is non-empty.  Returns the new element instance.
    """
    node = Element(ProcessingInstruction)
    node.text = (target + " " + text) if text else target
    return node


PI = ProcessingInstruction
554
555##
556# QName wrapper. This can be used to wrap a QName attribute value, in
557# order to get proper namespace handling on output.
558#
559# @param text A string containing the QName value, in the form {uri}local,
560# or, if the tag argument is given, the URI part of a QName.
561# @param tag Optional tag. If given, the first argument is interpreted as
562# an URI, and this argument is interpreted as a local name.
563# @return An opaque object, representing the QName.
564
class QName:
    """Wrapper for a qualified name.

    *text_or_uri* is either the full QName in ``{uri}local`` form or, when
    *tag* is given (and non-empty), just the URI part, with *tag* being
    the local name.  The resulting string is stored in ``text``.

    Instances hash like their text and compare by text, both against other
    QName objects and against any other value.
    """

    def __init__(self, text_or_uri, tag=None):
        self.text = ("{%s}%s" % (text_or_uri, tag)) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<QName %r>' % (self.text,)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps a QName operand to its text and otherwise
    # compares against the other operand as-is.

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

    def __ne__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text != rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs
Armin Rigo9ed73062005-12-14 18:10:45 +0000600
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000601# --------------------------------------------------------------------
602
Armin Rigo9ed73062005-12-14 18:10:45 +0000603##
604# ElementTree wrapper class. This class represents an entire element
605# hierarchy, and adds some extra support for serialization to and from
606# standard XML.
607#
608# @param element Optional root element.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000609# @keyparam file Optional file handle or file name. If given, the
Armin Rigo9ed73062005-12-14 18:10:45 +0000610# tree is initialized with the contents of this XML file.
611
612class ElementTree:
613
614 def __init__(self, element=None, file=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000615 # assert element is None or iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000616 self._root = element # first node
617 if file:
618 self.parse(file)
619
620 ##
621 # Gets the root element for this tree.
622 #
623 # @return An element instance.
624 # @defreturn Element
625
626 def getroot(self):
627 return self._root
628
629 ##
630 # Replaces the root element for this tree. This discards the
631 # current contents of the tree, and replaces it with the given
632 # element. Use with care.
633 #
634 # @param element An element instance.
635
636 def _setroot(self, element):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000637 # assert iselement(element)
Armin Rigo9ed73062005-12-14 18:10:45 +0000638 self._root = element
639
640 ##
641 # Loads an external XML document into this element tree.
642 #
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000643 # @param source A file name or file object. If a file object is
644 # given, it only has to implement a <b>read(n)</b> method.
645 # @keyparam parser An optional parser instance. If not given, the
646 # standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +0000647 # @return The document root element.
648 # @defreturn Element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000649 # @exception ParseError If the parser fails to parse the document.
Armin Rigo9ed73062005-12-14 18:10:45 +0000650
651 def parse(self, source, parser=None):
Antoine Pitroue033e062010-10-29 10:38:18 +0000652 close_source = False
Armin Rigo9ed73062005-12-14 18:10:45 +0000653 if not hasattr(source, "read"):
654 source = open(source, "rb")
Antoine Pitroue033e062010-10-29 10:38:18 +0000655 close_source = True
656 try:
657 if not parser:
658 parser = XMLParser(target=TreeBuilder())
659 while 1:
660 data = source.read(65536)
661 if not data:
662 break
663 parser.feed(data)
664 self._root = parser.close()
665 return self._root
666 finally:
667 if close_source:
668 source.close()
Armin Rigo9ed73062005-12-14 18:10:45 +0000669
670 ##
671 # Creates a tree iterator for the root element. The iterator loops
672 # over all elements in this tree, in document order.
673 #
674 # @param tag What tags to look for (default is to return all elements)
675 # @return An iterator.
676 # @defreturn iterator
677
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000678 def iter(self, tag=None):
679 # assert self._root is not None
680 return self._root.iter(tag)
681
682 # compatibility
Armin Rigo9ed73062005-12-14 18:10:45 +0000683 def getiterator(self, tag=None):
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000684 # Change for a DeprecationWarning in 1.4
685 warnings.warn(
686 "This method will be removed in future versions. "
687 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
688 PendingDeprecationWarning, stacklevel=2
689 )
690 return list(self.iter(tag))
Armin Rigo9ed73062005-12-14 18:10:45 +0000691
692 ##
693 # Finds the first toplevel element with given tag.
694 # Same as getroot().find(path).
695 #
696 # @param path What element to look for.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000697 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000698 # @return The first matching element, or None if no element was found.
699 # @defreturn Element or None
700
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000701 def find(self, path, namespaces=None):
702 # assert self._root is not None
Armin Rigo9ed73062005-12-14 18:10:45 +0000703 if path[:1] == "/":
704 path = "." + path
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000705 warnings.warn(
706 "This search is broken in 1.3 and earlier, and will be "
707 "fixed in a future version. If you rely on the current "
708 "behaviour, change it to %r" % path,
709 FutureWarning, stacklevel=2
710 )
711 return self._root.find(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000712
713 ##
714 # Finds the element text for the first toplevel element with given
715 # tag. Same as getroot().findtext(path).
716 #
717 # @param path What toplevel element to look for.
718 # @param default What to return if the element was not found.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000719 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000720 # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the element
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000722 # is found, but has no text content, this method returns an
Armin Rigo9ed73062005-12-14 18:10:45 +0000723 # empty string.
724 # @defreturn string
725
def findtext(self, path, default=None, namespaces=None):
    """Return the result of ``findtext`` on the root element for *path*.

    *path*, *default* and *namespaces* are handed to
    ``self._root.findtext`` in that order.  A *path* that begins with
    "/" is first rewritten to begin with "./" and a FutureWarning
    describing the rewrite is issued.
    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findtext(path, default, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000737
738 ##
739 # Finds all toplevel elements with the given tag.
740 # Same as getroot().findall(path).
741 #
742 # @param path What element to look for.
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000743 # @keyparam namespaces Optional namespace prefix map.
Armin Rigo9ed73062005-12-14 18:10:45 +0000744 # @return A list or iterator containing all matching elements,
745 # in document order.
746 # @defreturn list of Element instances
747
def findall(self, path, namespaces=None):
    """Return the result of ``findall`` on the root element for *path*.

    *path* and *namespaces* are handed to ``self._root.findall``.
    A *path* that begins with "/" is first rewritten to begin with "./"
    and a FutureWarning describing the rewrite is issued.
    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.findall(path, namespaces)
759
760 ##
761 # Finds all matching subelements, by tag name or path.
762 # Same as getroot().iterfind(path).
763 #
764 # @param path What element to look for.
765 # @keyparam namespaces Optional namespace prefix map.
766 # @return An iterator or sequence containing all matching elements,
767 # in document order.
768 # @defreturn a generated sequence of Element instances
769
def iterfind(self, path, namespaces=None):
    """Return the result of ``iterfind`` on the root element for *path*.

    *path* and *namespaces* are handed to ``self._root.iterfind``.
    A *path* that begins with "/" is first rewritten to begin with "./"
    and a FutureWarning describing the rewrite is issued.
    """
    # assert self._root is not None
    if path[:1] == "/":
        path = "." + path
        message = (
            "This search is broken in 1.3 and earlier, and will be "
            "fixed in a future version. If you rely on the current "
            "behaviour, change it to %r" % path
        )
        warnings.warn(message, FutureWarning, stacklevel=2)
    return self._root.iterfind(path, namespaces)
Armin Rigo9ed73062005-12-14 18:10:45 +0000781
def write(self, file_or_filename,
          encoding=None,
          xml_declaration=None,
          default_namespace=None,
          method=None, *,
          short_empty_elements=True):
    """Serialize the tree rooted at ``self._root`` to *file_or_filename*.

    'file_or_filename' is a file name or a file object opened for
    writing.
    'encoding' is the output encoding; when omitted it is "utf-8" for
    the "c14n" method and "us-ascii" otherwise.  A given encoding is
    lower-cased before use.
    'xml_declaration' controls whether an XML declaration is written
    (only for method "xml"): a true value forces it, None writes it
    only when the encoding is not US-ASCII, UTF-8 or "unicode", any
    other false value suppresses it.
    'default_namespace' sets the default XML namespace (for "xmlns").
    'method' is a key of the serializer table; "xml" is used when it
    is omitted, and an unknown name raises ValueError.
    The keyword-only 'short_empty_elements' flag is forwarded to the
    serializer; it controls whether elements without content are
    written as a single self-closed tag (True, the default) or as a
    start/end tag pair.

    """
    if not method:
        method = "xml"
    elif method not in _serialize:
        raise ValueError("unknown method %r" % method)
    if encoding:
        encoding = encoding.lower()
    else:
        encoding = "utf-8" if method == "c14n" else "us-ascii"
    with _get_writer(file_or_filename, encoding) as emit:
        # Decide whether a declaration is wanted: an explicit setting
        # wins, None falls back to "only for non-default encodings".
        if xml_declaration is None:
            declare = encoding not in ("utf-8", "us-ascii", "unicode")
        else:
            declare = xml_declaration
        if method == "xml" and declare:
            if encoding == "unicode":
                # Retrieve the default encoding for the xml declaration
                import locale
                declared_encoding = locale.getpreferredencoding()
            else:
                declared_encoding = encoding
            emit("<?xml version='1.0' encoding='%s'?>\n" % (
                declared_encoding,))
        if method == "text":
            _serialize_text(emit, self._root)
        else:
            qnames, namespaces = _namespaces(self._root, default_namespace)
            _serialize[method](emit, self._root, qnames, namespaces,
                               short_empty_elements=short_empty_elements)
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000831
def write_c14n(self, file):
    """Write the tree to *file* using the "c14n" output method.

    Kept for lxml.etree compatibility; equivalent to calling
    ``self.write(file, method="c14n")``, whose result is returned.
    """
    result = self.write(file, method="c14n")
    return result
Armin Rigo9ed73062005-12-14 18:10:45 +0000835
836# --------------------------------------------------------------------
Florent Xiclunaf15351d2010-03-13 23:24:31 +0000837# serialization support
838
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Context manager yielding a text ``write`` callable for the target.

    *file_or_filename* is either an object with a ``write`` attribute or
    something accepted by ``open()``.  *encoding* is the output encoding;
    the special value "unicode" means the target is written to as text
    without wrapping.

    A file opened here is closed when the context exits.  For a
    caller-supplied object, every wrapper created here is detached on
    exit rather than closed, so the caller's object stays open.
    """
    # returns text write method and releases all resources after use
    try:
        write = file_or_filename.write
    except AttributeError:
        # file_or_filename is a file name
        if encoding == "unicode":
            file = open(file_or_filename, "w")
        else:
            # characters the encoding cannot represent are written as
            # numeric character references instead of raising
            file = open(file_or_filename, "w", encoding=encoding,
                        errors="xmlcharrefreplace")
        with file:
            yield file.write
    else:
        # file_or_filename is a file-like object
        # encoding determines if it is a text or binary writer
        if encoding == "unicode":
            # use a text writer as is
            yield write
        else:
            # wrap a binary writer with TextIOWrapper
            with contextlib.ExitStack() as stack:
                if isinstance(file_or_filename, io.BufferedIOBase):
                    file = file_or_filename
                elif isinstance(file_or_filename, io.RawIOBase):
                    file = io.BufferedWriter(file_or_filename)
                    # Keep the original file open when the BufferedWriter is
                    # destroyed
                    stack.callback(file.detach)
                else:
                    # This is to handle passed objects that aren't in the
                    # IOBase hierarchy, but just have a write method
                    file = io.BufferedIOBase()
                    file.writable = lambda: True
                    file.write = write
                    try:
                        # TextIOWrapper uses these methods to determine
                        # if BOM (for UTF-16, etc) should be added
                        file.seekable = file_or_filename.seekable
                        file.tell = file_or_filename.tell
                    except AttributeError:
                        pass
                file = io.TextIOWrapper(file,
                                        encoding=encoding,
                                        errors="xmlcharrefreplace",
                                        newline="\n")
                # Keep the original file open when the TextIOWrapper is
                # destroyed
                stack.callback(file.detach)
                yield file.write
890
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Walks ``elem.iter()`` and looks at every tag, attribute key, and any
    attribute value or element text that is a QName instance.

    Returns a ``(qnames, namespaces)`` pair: *qnames* maps each name as it
    appears in the tree to the string to write for it ("prefix:local" or
    the bare name), and *namespaces* maps namespace URIs to prefixes.

    Raises ValueError when *default_namespace* is given and an
    unqualified name is found.  Names of an unsupported type are reported
    through ``_raise_serialization_error``.
    """
    # identify namespaces used in this tree

    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        # the empty prefix marks the default namespace
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                # "{uri}local" form
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    # not seen in this tree yet: try the global registry,
                    # then fall back to a generated "nsN" prefix
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        prefix = "ns%d" % len(namespaces)
                    # the "xml" prefix is never recorded, so no xmlns
                    # declaration is written for it
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = "%s:%s" % (prefix, tag)
                else:
                    qnames[qname] = tag # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    # NOTE: the loop variable rebinds the ``elem`` parameter; the iterator
    # is created from the original argument before the first rebinding.
    for elem in elem.iter():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
951
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML by calling *write* with strings.

    *qnames* maps tags and attribute names to the names to write.
    *namespaces* maps URIs to prefixes and produces the ``xmlns``
    attributes on this element; recursive calls pass None, so the
    declarations appear only on the element this function was first
    called with.  When *short_empty_elements* is true, an element with no
    text and no children is written as a self-closed tag.  Extra keyword
    arguments are accepted and ignored.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        # comment text is written as is, without escaping
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        # processing-instruction text is written as is, without escaping
        write("<?%s?>" % text)
    else:
        tag = qnames[tag]
        if tag is None:
            # no name to write: emit only the text and the children
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_xml(write, e, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        # QName values are written using their mapped name
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for e in elem:
                    _serialize_xml(write, e, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + tag + ">")
            else:
                write(" />")
    # the tail is written for every kind of node, after its end
    if elem.tail:
        write(_escape_cdata(elem.tail))
1001
# Lower-case names of the elements for which the HTML serializer writes no
# end tag.  Stored as a set for constant-time membership tests.  The builtin
# ``set`` always exists on the Python versions this module runs on, so the
# old tuple-then-convert fallback is not needed.
HTML_EMPTY = {"area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param"}
1009
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML by calling *write* with strings.

    *qnames* maps tags and attribute names to the names to write.
    *namespaces* maps URIs to prefixes and produces the ``xmlns``
    attributes on this element; recursive calls pass None.  Extra keyword
    arguments are accepted and ignored.

    Differences from the XML serializer visible here: comment and
    processing-instruction text is escaped, the text of ``script`` and
    ``style`` elements is written unescaped, and elements named in
    HTML_EMPTY get no end tag.  Element names are compared
    case-insensitively, but the end tag is written with the same spelling
    as the start tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        tag = qnames[tag]
        if tag is None:
            # no name to write: emit only the text and the children
            if text:
                write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
        else:
            write("<" + tag)
            items = list(elem.items())
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k,
                            _escape_attrib(v)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            # Use a lower-cased copy for the lookups only, so the end tag
            # below matches the start tag that was already written.
            ltag = tag.lower()
            if text:
                if ltag == "script" or ltag == "style":
                    write(text)
                else:
                    write(_escape_cdata(text))
            for e in elem:
                _serialize_html(write, e, qnames, None)
            if ltag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
1059
1060def _serialize_text(write, elem):
1061 for part in elem.itertext():
1062 write(part)
1063 if elem.tail:
1064 write(elem.tail)
1065
# Dispatch table: output method name -> serializer function.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
    # this optional method is imported at the end of the module
    # c14n=_serialize_c14n,
)
Armin Rigo9ed73062005-12-14 18:10:45 +00001073
1074##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001075# Registers a namespace prefix. The registry is global, and any
1076# existing mapping for either the given prefix or the namespace URI
1077# will be removed.
Armin Rigo9ed73062005-12-14 18:10:45 +00001078#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001079# @param prefix Namespace prefix.
1080# @param uri Namespace uri. Tags and attributes in this namespace
1081# will be serialized with the given prefix, if at all possible.
1082# @exception ValueError If the prefix is reserved, or is otherwise
1083# invalid.
Armin Rigo9ed73062005-12-14 18:10:45 +00001084
def register_namespace(prefix, uri):
    """Register *prefix* as the serialization prefix for namespace *uri*.

    The registry is global.  Any existing entry for either the given
    prefix or the given URI is removed before the new mapping is stored.

    Raises ValueError if *prefix* has the form "ns<digits>", which is
    reserved for internal use.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot because entries are deleted inside the loop.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1092
# Global registry mapping namespace URIs to the prefix used for them when
# serializing.  register_namespace() adds entries to and removes entries
# from this dictionary.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting: expose the registry as an attribute of
# the public function.
register_namespace._namespace_map = _namespace_map
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001107
1108def _raise_serialization_error(text):
1109 raise TypeError(
1110 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1111 )
1112
1113def _escape_cdata(text):
1114 # escape character data
1115 try:
1116 # it's worth avoiding do-nothing calls for strings that are
1117 # shorter than 500 character, or so. assume that's, by far,
1118 # the most common case in most applications.
1119 if "&" in text:
1120 text = text.replace("&", "&amp;")
1121 if "<" in text:
1122 text = text.replace("<", "&lt;")
1123 if ">" in text:
1124 text = text.replace(">", "&gt;")
1125 return text
1126 except (TypeError, AttributeError):
1127 _raise_serialization_error(text)
1128
1129def _escape_attrib(text):
1130 # escape attribute value
1131 try:
1132 if "&" in text:
1133 text = text.replace("&", "&amp;")
1134 if "<" in text:
1135 text = text.replace("<", "&lt;")
1136 if ">" in text:
1137 text = text.replace(">", "&gt;")
1138 if "\"" in text:
1139 text = text.replace("\"", "&quot;")
1140 if "\n" in text:
1141 text = text.replace("\n", "&#10;")
1142 return text
1143 except (TypeError, AttributeError):
1144 _raise_serialization_error(text)
1145
1146def _escape_attrib_html(text):
1147 # escape attribute value
1148 try:
1149 if "&" in text:
1150 text = text.replace("&", "&amp;")
1151 if ">" in text:
1152 text = text.replace(">", "&gt;")
1153 if "\"" in text:
1154 text = text.replace("\"", "&quot;")
1155 return text
1156 except (TypeError, AttributeError):
1157 _raise_serialization_error(text)
1158
1159# --------------------------------------------------------------------
1160
1161##
1162# Generates a string representation of an XML element, including all
Florent Xiclunac17f1722010-08-08 19:48:29 +00001163# subelements. If encoding is "unicode", the return type is a string;
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001164# otherwise it is a bytes array.
1165#
1166# @param element An Element instance.
Florent Xiclunac17f1722010-08-08 19:48:29 +00001167# @keyparam encoding Optional output encoding (default is US-ASCII).
1168# Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001169# @keyparam method Optional output method ("xml", "html", "text" or
1170# "c14n"; default is "xml").
1171# @return An (optionally) encoded string containing the XML data.
1172# @defreturn string
1173
def tostring(element, encoding=None, method=None, *,
             short_empty_elements=True):
    """Serialize *element* and its subtree and return the result.

    The element is wrapped in an ElementTree and written to an in-memory
    stream: a text stream when *encoding* is "unicode" (the result is a
    str), a byte stream otherwise (the result is bytes).  *encoding*,
    *method* and *short_empty_elements* are forwarded to
    ``ElementTree.write``.
    """
    if encoding == 'unicode':
        stream = io.StringIO()
    else:
        stream = io.BytesIO()
    tree = ElementTree(element)
    tree.write(stream, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return stream.getvalue()
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001180
1181##
1182# Generates a string representation of an XML element, including all
Eli Bendersky00f402b2012-07-15 06:02:22 +03001183# subelements.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001184#
1185# @param element An Element instance.
1186# @keyparam encoding Optional output encoding (default is US-ASCII).
Florent Xiclunac17f1722010-08-08 19:48:29 +00001187# Use "unicode" to return a Unicode string.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001188# @keyparam method Optional output method ("xml", "html", "text" or
1189# "c14n"; default is "xml").
1190# @return A sequence object containing the XML data.
1191# @defreturn sequence
1192# @since 1.3
1193
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001194class _ListDataStream(io.BufferedIOBase):
1195 """ An auxiliary stream accumulating into a list reference
1196 """
1197 def __init__(self, lst):
1198 self.lst = lst
Eli Benderskyf90fc682012-07-17 15:09:56 +03001199
Eli Bendersky43cc5f22012-07-17 15:09:12 +03001200 def writable(self):
1201 return True
1202
1203 def seekable(self):
1204 return True
1205
1206 def write(self, b):
1207 self.lst.append(b)
1208
1209 def tell(self):
1210 return len(self.lst)
1211
def tostringlist(element, encoding=None, method=None, *,
                 short_empty_elements=True):
    """Serialize *element* and return the output as a list of chunks.

    The element is wrapped in an ElementTree and written to a
    _ListDataStream backed by a fresh list, which is returned.
    *encoding*, *method* and *short_empty_elements* are forwarded to
    ``ElementTree.write``.
    """
    chunks = []
    sink = _ListDataStream(chunks)
    tree = ElementTree(element)
    tree.write(sink, encoding, method=method,
               short_empty_elements=short_empty_elements)
    return chunks
Armin Rigo9ed73062005-12-14 18:10:45 +00001219
1220##
1221# Writes an element tree or element structure to sys.stdout. This
1222# function should be used for debugging only.
1223# <p>
1224# The exact output format is implementation dependent. In this
1225# version, it's written as an ordinary XML file.
1226#
1227# @param elem An element tree or an individual element.
1228
def dump(elem):
    """Write *elem* to sys.stdout; meant for debugging.

    *elem* may be an ElementTree or a single element, which is wrapped in
    an ElementTree first.  A newline is written afterwards unless the
    root element's tail already ends with one.
    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1237
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001238# --------------------------------------------------------------------
1239# parsing
Armin Rigo9ed73062005-12-14 18:10:45 +00001240
1241##
1242# Parses an XML document into an element tree.
1243#
1244# @param source A filename or file object containing XML data.
1245# @param parser An optional parser instance. If not given, the
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001246# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001247# @return An ElementTree instance
1248
def parse(source, parser=None):
    """Create an ElementTree, load *source* into it and return it.

    *source* and *parser* are passed unchanged to ``ElementTree.parse``.
    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1253
1254##
1255# Parses an XML document into an element tree incrementally, and reports
1256# what's going on to the user.
1257#
1258# @param source A filename or file object containing XML data.
1259# @param events A list of events to report back. If omitted, only "end"
1260# events are reported.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001261# @param parser An optional parser instance. If not given, the
1262# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001263# @return A (event, elem) iterator.
1264
def iterparse(source, events=None, parser=None):
    """Return an iterator of ``(event, elem)`` pairs parsed from *source*.

    *source* is an object with a ``read`` attribute, or a file name that
    is opened here in binary mode.  *events* is the list of events to
    report; it is passed on to the iterator unchanged.  *parser* is an
    optional parser instance; when omitted, an XMLParser feeding a
    TreeBuilder is created.

    A file opened here is owned by the returned iterator.  If setting up
    the parser or the iterator fails, that file is closed before the
    exception propagates.  A file object supplied by the caller is never
    closed by this function.
    """
    close_source = False
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    try:
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        return _IterParseIterator(source, events, parser, close_source)
    except BaseException:
        # Setup failed, so no iterator exists to take ownership of the
        # file; release it here instead of leaking it.
        if close_source:
            source.close()
        raise
Armin Rigo9ed73062005-12-14 18:10:45 +00001273
class _IterParseIterator:
    """Iterator that feeds a file to a parser and yields (event, value) pairs.

    Handlers installed on the underlying parser object append pairs to an
    internal buffer.  ``__next__`` drains that buffer and, when it is
    empty, reads the next chunk of the file and feeds it to the parser.
    After exhaustion the ``root`` attribute holds the value returned by
    the parser's ``close()``.
    """

    def __init__(self, source, events, parser, close_source=False):
        # source: object with read(); closed at exhaustion only when
        # close_source is true.
        self._file = source
        self._close_file = close_source
        self._events = []
        self._index = 0
        self._error = None
        self.root = self._root = None
        self._parser = parser
        # wire up the parser for event reporting
        # (from here on ``parser`` names the wrapped low-level parser
        # object, not the argument)
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # The handlers bind ``event``, ``append`` and the builder callback
        # as default arguments, so each keeps the values current when it
        # was defined.
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back to the parser's plain start callback
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # missing prefix/uri are reported as empty strings
                    append((event, (prefix or "", uri or "")))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler
            else:
                raise ValueError("unknown event %r" % event)

    def __next__(self):
        # Return the next buffered pair, refilling the buffer from the
        # file as needed.
        while 1:
            try:
                item = self._events[self._index]
                self._index += 1
                return item
            except IndexError:
                pass
            # Buffer drained.  A parse error caught while feeding is
            # raised only now, after the events buffered before it were
            # delivered.
            if self._error:
                e = self._error
                self._error = None
                raise e
            if self._parser is None:
                # parser already closed: publish the root and stop
                self.root = self._root
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            del self._events[:]
            self._index = 0
            data = self._file.read(16384)
            if data:
                try:
                    self._parser.feed(data)
                except SyntaxError as exc:
                    self._error = exc
            else:
                # end of input: close the parser and remember its result
                self._root = self._parser.close()
                self._parser = None

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +00001351
1352##
1353# Parses an XML document from a string constant. This function can
1354# be used to embed "XML literals" in Python code.
1355#
1356# @param source A string containing XML data.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001357# @param parser An optional parser instance. If not given, the
1358# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001359# @return An Element instance.
1360# @defreturn Element
1361
def XML(text, parser=None):
    """Parse *text* and return what the parser's ``close()`` returns.

    *text* is fed to *parser* in a single ``feed()`` call.  When *parser*
    is omitted (or false), an XMLParser feeding a TreeBuilder is created.
    """
    parser = parser or XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
1367
1368##
1369# Parses an XML document from a string constant, and also returns
1370# a dictionary which maps from element id:s to elements.
1371#
1372# @param source A string containing XML data.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001373# @param parser An optional parser instance. If not given, the
1374# standard {@link XMLParser} parser is used.
Armin Rigo9ed73062005-12-14 18:10:45 +00001375# @return A tuple containing an Element instance and a dictionary.
1376# @defreturn (Element, dictionary)
1377
def XMLID(text, parser=None):
    """Parse *text* and return ``(root, ids)``.

    *root* is what the parser's ``close()`` returns.  *ids* maps the value
    of each element's "id" attribute to that element, for every element
    from ``root.iter()`` whose "id" value is true; a later element with
    the same id replaces an earlier one.  When *parser* is omitted (or
    false), an XMLParser feeding a TreeBuilder is created.
    """
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    root = parser.close()
    id_map = {}
    for node in root.iter():
        node_id = node.get("id")
        if node_id:
            id_map[node_id] = node
    return root, id_map
1389
1390##
1391# Parses an XML document from a string constant. Same as {@link #XML}.
1392#
1393# @def fromstring(text)
1394# @param source A string containing XML data.
1395# @return An Element instance.
1396# @defreturn Element
1397
# fromstring is the same function object as XML, bound to a second name.
fromstring = XML
1399
1400##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001401# Parses an XML document from a sequence of string fragments.
Armin Rigo9ed73062005-12-14 18:10:45 +00001402#
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001403# @param sequence A list or other sequence containing XML data fragments.
1404# @param parser An optional parser instance. If not given, the
1405# standard {@link XMLParser} parser is used.
1406# @return An Element instance.
1407# @defreturn Element
1408# @since 1.3
Armin Rigo9ed73062005-12-14 18:10:45 +00001409
def fromstringlist(sequence, parser=None):
    """Parse a document given as a sequence of fragments.

    Each item of *sequence* is passed to the parser's ``feed()`` in
    order; the value of the parser's ``close()`` is returned.  When
    *parser* is omitted (or false), an XMLParser feeding a TreeBuilder is
    created.
    """
    parser = parser or XMLParser(target=TreeBuilder())
    for fragment in sequence:
        parser.feed(fragment)
    return parser.close()
1416
1417# --------------------------------------------------------------------
Armin Rigo9ed73062005-12-14 18:10:45 +00001418
1419##
1420# Generic element structure builder. This builder converts a sequence
1421# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1422# #TreeBuilder.end} method calls to a well-formed element structure.
1423# <p>
1424# You can use this class to build an element structure using a custom XML
1425# parser, or a parser for some other XML-like format.
1426#
1427# @param element_factory Optional element factory. This factory
1428# is called to create new Element instances, as necessary.
1429
class TreeBuilder:
    """Build an element structure from start/data/end calls.

    *element_factory* is called as ``factory(tag, attrs)`` to create each
    element; when omitted, Element is used.
    """

    def __init__(self, element_factory=None):
        self._data = []  # data collector
        self._elem = []  # element stack
        self._last = None  # last element
        self._tail = None  # true if we're after an end tag
        self._factory = Element if element_factory is None else element_factory

    def close(self):
        """Return the last element that was closed.

        Asserts that no element is still open and that at least one
        element was seen.
        """
        assert not self._elem, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the collected text to the most recent element: as its
        # tail when we are after an end tag, as its text otherwise.  Text
        # collected before any element exists is discarded.
        if not self._data:
            return
        last = self._last
        if last is not None:
            text = "".join(self._data)
            if self._tail:
                assert last.tail is None, "internal error (tail)"
                last.tail = text
            else:
                assert last.text is None, "internal error (text)"
                last.text = text
        self._data = []

    def data(self, data):
        """Collect the string *data* as text for the current position."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open a new element and return it.

        The element is created by the factory from *tag* and *attrs*,
        appended to the element currently open (if any) and pushed on
        the stack.
        """
        self._flush()
        elem = self._factory(tag, attrs)
        self._last = elem
        stack = self._elem
        if stack:
            stack[-1].append(elem)
        stack.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close the current element and return it.

        Asserts that the element being closed has the given *tag*.
        """
        self._flush()
        closed = self._elem.pop()
        self._last = closed
        assert closed.tag == tag, \
            "end tag mismatch (expected %s, got %s)" % (closed.tag, tag)
        self._tail = 1
        return closed
1506
1507##
1508# Element structure builder for XML source data, based on the
1509# <b>expat</b> parser.
1510#
1511# @keyparam target Target object. If omitted, the builder uses an
1512# instance of the standard {@link #TreeBuilder} class.
1513# @keyparam html Predefine HTML entities. This flag is not supported
1514# by the current implementation.
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001515# @keyparam encoding Optional encoding. If given, the value overrides
1516# the encoding specified in the XML file.
Armin Rigo9ed73062005-12-14 18:10:45 +00001517# @see #ElementTree
1518# @see #TreeBuilder
1519
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001520class XMLParser:
Armin Rigo9ed73062005-12-14 18:10:45 +00001521
def __init__(self, html=0, target=None, encoding=None):
    # Create the expat parser and connect it to *target*.
    #
    # html:     not referenced anywhere in this method.
    # target:   object receiving the parse callbacks; a TreeBuilder is
    #           created when it is None.  Only the callbacks the target
    #           actually has (start, end, data, comment, pi) are wired up.
    # encoding: passed to expat.ParserCreate as its first argument.
    #
    # Raises ImportError when neither xml.parsers.expat nor pyexpat can
    # be imported.
    try:
        from xml.parsers import expat
    except ImportError:
        try:
            import pyexpat as expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
    # second argument: namespace separator given to expat
    parser = expat.ParserCreate(encoding, "}")
    if target is None:
        target = TreeBuilder()
    # underscored names are provided for compatibility only
    self.parser = self._parser = parser
    self.target = self._target = target
    self._error = expat.error
    self._names = {} # name memo cache
    # main callbacks
    parser.DefaultHandlerExpand = self._default
    if hasattr(target, 'start'):
        parser.StartElementHandler = self._start
    if hasattr(target, 'end'):
        parser.EndElementHandler = self._end
    if hasattr(target, 'data'):
        parser.CharacterDataHandler = target.data
    # miscellaneous callbacks
    if hasattr(target, 'comment'):
        parser.CommentHandler = target.comment
    if hasattr(target, 'pi'):
        parser.ProcessingInstructionHandler = target.pi
    # let expat do the buffering, if supported
    try:
        parser.buffer_text = 1
    except AttributeError:
        pass
    # use new-style attribute handling, if supported
    # (replaces the start handler installed above with the list-based one)
    try:
        parser.ordered_attributes = 1
        parser.specified_attributes = 1
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start_list
    except AttributeError:
        pass
    self._doctype = None
    self.entity = {}
    try:
        self.version = "Expat %d.%d.%d" % expat.version_info
    except AttributeError:
        pass # unknown
1572
1573 def _raiseerror(self, value):
1574 err = ParseError(value)
1575 err.code = value.code
1576 err.position = value.lineno, value.offset
1577 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001578
Armin Rigo9ed73062005-12-14 18:10:45 +00001579 def _fixname(self, key):
1580 # expand qname, and convert name string to ascii, if possible
1581 try:
1582 name = self._names[key]
1583 except KeyError:
1584 name = key
1585 if "}" in name:
1586 name = "{" + name
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001587 self._names[key] = name
Armin Rigo9ed73062005-12-14 18:10:45 +00001588 return name
1589
1590 def _start(self, tag, attrib_in):
1591 fixname = self._fixname
1592 tag = fixname(tag)
1593 attrib = {}
1594 for key, value in attrib_in.items():
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001595 attrib[fixname(key)] = value
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001596 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001597
1598 def _start_list(self, tag, attrib_in):
1599 fixname = self._fixname
1600 tag = fixname(tag)
1601 attrib = {}
1602 if attrib_in:
1603 for i in range(0, len(attrib_in), 2):
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +00001604 attrib[fixname(attrib_in[i])] = attrib_in[i+1]
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001605 return self.target.start(tag, attrib)
Armin Rigo9ed73062005-12-14 18:10:45 +00001606
Armin Rigo9ed73062005-12-14 18:10:45 +00001607 def _end(self, tag):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001608 return self.target.end(self._fixname(tag))
1609
Armin Rigo9ed73062005-12-14 18:10:45 +00001610 def _default(self, text):
1611 prefix = text[:1]
1612 if prefix == "&":
1613 # deal with undefined entities
1614 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001615 data_handler = self.target.data
1616 except AttributeError:
1617 return
1618 try:
1619 data_handler(self.entity[text[1:-1]])
Armin Rigo9ed73062005-12-14 18:10:45 +00001620 except KeyError:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001621 from xml.parsers import expat
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001622 err = expat.error(
Armin Rigo9ed73062005-12-14 18:10:45 +00001623 "undefined entity %s: line %d, column %d" %
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001624 (text, self.parser.ErrorLineNumber,
1625 self.parser.ErrorColumnNumber)
Armin Rigo9ed73062005-12-14 18:10:45 +00001626 )
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001627 err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001628 err.lineno = self.parser.ErrorLineNumber
1629 err.offset = self.parser.ErrorColumnNumber
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001630 raise err
Armin Rigo9ed73062005-12-14 18:10:45 +00001631 elif prefix == "<" and text[:9] == "<!DOCTYPE":
1632 self._doctype = [] # inside a doctype declaration
1633 elif self._doctype is not None:
1634 # parse doctype contents
1635 if prefix == ">":
1636 self._doctype = None
1637 return
Neal Norwitz9d72bb42007-04-17 08:48:32 +00001638 text = text.strip()
Armin Rigo9ed73062005-12-14 18:10:45 +00001639 if not text:
1640 return
1641 self._doctype.append(text)
1642 n = len(self._doctype)
1643 if n > 2:
1644 type = self._doctype[1]
1645 if type == "PUBLIC" and n == 4:
1646 name, type, pubid, system = self._doctype
Florent Xiclunaa1c974a2012-07-07 13:16:44 +02001647 if pubid:
1648 pubid = pubid[1:-1]
Armin Rigo9ed73062005-12-14 18:10:45 +00001649 elif type == "SYSTEM" and n == 3:
1650 name, type, system = self._doctype
1651 pubid = None
1652 else:
1653 return
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001654 if hasattr(self.target, "doctype"):
1655 self.target.doctype(name, pubid, system[1:-1])
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001656 elif self.doctype != self._XMLParser__doctype:
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001657 # warn about deprecated call
1658 self._XMLParser__doctype(name, pubid, system[1:-1])
1659 self.doctype(name, pubid, system[1:-1])
Armin Rigo9ed73062005-12-14 18:10:45 +00001660 self._doctype = None
1661
1662 ##
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001663 # (Deprecated) Handles a doctype declaration.
Armin Rigo9ed73062005-12-14 18:10:45 +00001664 #
1665 # @param name Doctype name.
1666 # @param pubid Public identifier.
1667 # @param system System identifier.
1668
1669 def doctype(self, name, pubid, system):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001670 """This method of XMLParser is deprecated."""
1671 warnings.warn(
1672 "This method of XMLParser is deprecated. Define doctype() "
1673 "method on the TreeBuilder target.",
1674 DeprecationWarning,
1675 )
1676
1677 # sentinel, if doctype is redefined in a subclass
1678 __doctype = doctype
Armin Rigo9ed73062005-12-14 18:10:45 +00001679
1680 ##
1681 # Feeds data to the parser.
1682 #
1683 # @param data Encoded data.
1684
1685 def feed(self, data):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001686 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001687 self.parser.Parse(data, 0)
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001688 except self._error as v:
1689 self._raiseerror(v)
Armin Rigo9ed73062005-12-14 18:10:45 +00001690
1691 ##
1692 # Finishes feeding data to the parser.
1693 #
1694 # @return An element structure.
1695 # @defreturn Element
1696
1697 def close(self):
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001698 try:
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001699 self.parser.Parse("", 1) # end of data
Florent Xiclunaf15351d2010-03-13 23:24:31 +00001700 except self._error as v:
1701 self._raiseerror(v)
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001702 try:
Florent Xiclunafb067462012-03-05 11:42:49 +01001703 close_handler = self.target.close
1704 except AttributeError:
1705 pass
1706 else:
1707 return close_handler()
Florent Xicluna75b5e7e2012-03-05 10:42:19 +01001708 finally:
1709 # get rid of circular references
1710 del self.parser, self._parser
1711 del self.target, self._target
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001712
Florent Xiclunaa72a98f2012-02-13 11:03:30 +01001713
1714# Import the C accelerators
1715try:
1716 # Element, SubElement, ParseError, TreeBuilder, XMLParser
1717 from _elementtree import *
1718except ImportError:
1719 pass
1720else:
1721 # Overwrite 'ElementTree.parse' and 'iterparse' to use the C XMLParser
1722
class ElementTree(ElementTree):
    """ElementTree variant whose parse() uses the C XMLParser."""

    def parse(self, source, parser=None):
        """Load an XML document into this tree and return its root.

        source -- a file name, or an object with a read() method
        parser -- optional parser instance; it is fed the document in
                  chunks.  Without one, a fresh XMLParser reads the
                  whole source through its _parse() method.

        A file opened here (because a name was given) is always closed
        again; a file object passed in by the caller is left open.
        """
        opened_here = not hasattr(source, 'read')
        if opened_here:
            source = open(source, 'rb')
        try:
            if parser is None:
                parser = XMLParser()
                self._root = parser._parse(source)
                return self._root
            chunk = source.read(65536)
            while chunk:
                parser.feed(chunk)
                chunk = source.read(65536)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()
1744
class iterparse:
    """Parse an XML document incrementally, reporting events as it goes.

    'file' is a filename or a file object containing XML data.  'events'
    is a sequence of events to report back; the supported events are the
    strings "start", "end", "start-ns" and "end-ns" (the "ns" events are
    used to get detailed namespace information).  If 'events' is omitted,
    only "end" events are reported.  'parser' is an optional parser
    instance; if not given, the standard XMLParser is used with a
    TreeBuilder target.

    The object is its own iterator and yields (event, elem) pairs.  Once
    iteration has finished, the 'root' attribute holds the value returned
    by the parser's close() method.
    """

    root = None

    def __init__(self, file, events=None, parser=None):
        owns_file = not hasattr(file, 'read')
        self._close_file = False
        if owns_file:
            file = open(file, 'rb')
            self._close_file = True
        self._file = file
        self._events = []   # filled by the parser, drained by __next__
        self._index = 0     # position of the next unread entry in _events
        self._error = None  # parse error waiting to be raised
        self.root = self._root = None
        if parser is None:
            parser = XMLParser(target=TreeBuilder())
        self._parser = parser
        self._parser._setevents(self._events, events)

    def __next__(self):
        while True:
            queue = self._events
            if self._index < len(queue):
                # hand out events that are already buffered
                item = queue[self._index]
                self._index += 1
                return item
            if self._error:
                # the buffer is drained: now report the deferred error
                failure, self._error = self._error, None
                raise failure
            if self._parser is None:
                # the parser was closed on a previous pass: we are done
                self.root = self._root
                if self._close_file:
                    self._file.close()
                raise StopIteration
            # load event buffer
            del queue[:]
            self._index = 0
            chunk = self._file.read(16384)
            if not chunk:
                self._root = self._parser.close()
                self._parser = None
                continue
            try:
                self._parser.feed(chunk)
            except SyntaxError as exc:
                # keep the error back until the events collected before
                # it have been delivered
                self._error = exc

    def __iter__(self):
        return self
1806
# compatibility: old name of the parser class
XMLTreeBuilder = XMLParser

# workaround circular import: the optional C14N serializer is registered
# here, and only when the ElementC14N module can be imported.
try:
    from ElementC14N import _serialize_c14n
except ImportError:
    pass
else:
    _serialize["c14n"] = _serialize_c14n