blob: 782af81b50a605ce676b8f7d5b8b3619cd148272 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
# Names exported by "from <module> import *"; kept in the original order.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "iselement",
    "iterparse",
    "parse",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser",
    "XMLTreeBuilder",
]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000112import sys, re
Armin Rigo9ed73062005-12-14 18:10:45 +0000113
Alex Martelli6cefeb02006-08-21 23:45:19 +0000114from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000115
116# TODO: add support for custom namespace resolvers/default namespaces
117# TODO: add improved support for incremental parsing
118
# Version string of this module; exported through __all__.
VERSION = "1.2.6"
120
121##
122# Internal element class. This class defines the Element interface,
123# and provides a reference implementation of this interface.
124# <p>
125# You should not create instances of this class directly. Use the
126# appropriate factory functions instead, such as {@link #Element}
127# and {@link #SubElement}.
128#
129# @see Element
130# @see SubElement
131# @see Comment
132# @see ProcessingInstruction
133
134class _ElementInterface:
135 # <tag attrib>text<child/>...</tag>tail
136
137 ##
138 # (Attribute) Element tag.
139
140 tag = None
141
142 ##
143 # (Attribute) Element attribute dictionary. Where possible, use
144 # {@link #_ElementInterface.get},
145 # {@link #_ElementInterface.set},
146 # {@link #_ElementInterface.keys}, and
147 # {@link #_ElementInterface.items} to access
148 # element attributes.
149
150 attrib = None
151
152 ##
153 # (Attribute) Text before first subelement. This is either a
154 # string or the value None, if there was no text.
155
156 text = None
157
158 ##
159 # (Attribute) Text after this element's end tag, but before the
160 # next sibling element's start tag. This is either a string or
161 # the value None, if there was no text.
162
163 tail = None # text after end tag, if any
164
165 def __init__(self, tag, attrib):
166 self.tag = tag
167 self.attrib = attrib
168 self._children = []
169
170 def __repr__(self):
171 return "<Element %s at %x>" % (self.tag, id(self))
172
173 ##
174 # Creates a new element object of the same type as this element.
175 #
176 # @param tag Element tag.
177 # @param attrib Element attributes, given as a dictionary.
178 # @return A new element instance.
179
180 def makeelement(self, tag, attrib):
181 return Element(tag, attrib)
182
183 ##
184 # Returns the number of subelements.
185 #
186 # @return The number of subelements.
187
188 def __len__(self):
189 return len(self._children)
190
191 ##
192 # Returns the given subelement.
193 #
194 # @param index What subelement to return.
195 # @return The given subelement.
196 # @exception IndexError If the given element does not exist.
197
198 def __getitem__(self, index):
199 return self._children[index]
200
201 ##
202 # Replaces the given subelement.
203 #
204 # @param index What subelement to replace.
205 # @param element The new element value.
206 # @exception IndexError If the given element does not exist.
207 # @exception AssertionError If element is not a valid object.
208
209 def __setitem__(self, index, element):
210 assert iselement(element)
211 self._children[index] = element
212
213 ##
214 # Deletes the given subelement.
215 #
216 # @param index What subelement to delete.
217 # @exception IndexError If the given element does not exist.
218
219 def __delitem__(self, index):
220 del self._children[index]
221
222 ##
223 # Returns a list containing subelements in the given range.
224 #
225 # @param start The first subelement to return.
226 # @param stop The first subelement that shouldn't be returned.
227 # @return A sequence object containing subelements.
228
229 def __getslice__(self, start, stop):
230 return self._children[start:stop]
231
232 ##
233 # Replaces a number of subelements with elements from a sequence.
234 #
235 # @param start The first subelement to replace.
236 # @param stop The first subelement that shouldn't be replaced.
237 # @param elements A sequence object with zero or more elements.
238 # @exception AssertionError If a sequence member is not a valid object.
239
240 def __setslice__(self, start, stop, elements):
241 for element in elements:
242 assert iselement(element)
243 self._children[start:stop] = list(elements)
244
245 ##
246 # Deletes a number of subelements.
247 #
248 # @param start The first subelement to delete.
249 # @param stop The first subelement to leave in there.
250
251 def __delslice__(self, start, stop):
252 del self._children[start:stop]
253
254 ##
255 # Adds a subelement to the end of this element.
256 #
257 # @param element The element to add.
258 # @exception AssertionError If a sequence member is not a valid object.
259
260 def append(self, element):
261 assert iselement(element)
262 self._children.append(element)
263
264 ##
265 # Inserts a subelement at the given position in this element.
266 #
267 # @param index Where to insert the new subelement.
268 # @exception AssertionError If the element is not a valid object.
269
270 def insert(self, index, element):
271 assert iselement(element)
272 self._children.insert(index, element)
273
274 ##
275 # Removes a matching subelement. Unlike the <b>find</b> methods,
276 # this method compares elements based on identity, not on tag
277 # value or contents.
278 #
279 # @param element What element to remove.
280 # @exception ValueError If a matching element could not be found.
281 # @exception AssertionError If the element is not a valid object.
282
283 def remove(self, element):
284 assert iselement(element)
285 self._children.remove(element)
286
287 ##
288 # Returns all subelements. The elements are returned in document
289 # order.
290 #
291 # @return A list of subelements.
292 # @defreturn list of Element instances
293
294 def getchildren(self):
295 return self._children
296
297 ##
298 # Finds the first matching subelement, by tag name or path.
299 #
300 # @param path What element to look for.
301 # @return The first matching element, or None if no element was found.
302 # @defreturn Element or None
303
304 def find(self, path):
305 return ElementPath.find(self, path)
306
307 ##
308 # Finds text for the first matching subelement, by tag name or path.
309 #
310 # @param path What element to look for.
311 # @param default What to return if the element was not found.
312 # @return The text content of the first matching element, or the
313 # default value no element was found. Note that if the element
314 # has is found, but has no text content, this method returns an
315 # empty string.
316 # @defreturn string
317
318 def findtext(self, path, default=None):
319 return ElementPath.findtext(self, path, default)
320
321 ##
322 # Finds all matching subelements, by tag name or path.
323 #
324 # @param path What element to look for.
325 # @return A list or iterator containing all matching elements,
326 # in document order.
327 # @defreturn list of Element instances
328
329 def findall(self, path):
330 return ElementPath.findall(self, path)
331
332 ##
333 # Resets an element. This function removes all subelements, clears
334 # all attributes, and sets the text and tail attributes to None.
335
336 def clear(self):
337 self.attrib.clear()
338 self._children = []
339 self.text = self.tail = None
340
341 ##
342 # Gets an element attribute.
343 #
344 # @param key What attribute to look for.
345 # @param default What to return if the attribute was not found.
346 # @return The attribute value, or the default value, if the
347 # attribute was not found.
348 # @defreturn string or None
349
350 def get(self, key, default=None):
351 return self.attrib.get(key, default)
352
353 ##
354 # Sets an element attribute.
355 #
356 # @param key What attribute to set.
357 # @param value The attribute value.
358
359 def set(self, key, value):
360 self.attrib[key] = value
361
362 ##
363 # Gets a list of attribute names. The names are returned in an
364 # arbitrary order (just like for an ordinary Python dictionary).
365 #
366 # @return A list of element attribute names.
367 # @defreturn list of strings
368
369 def keys(self):
370 return self.attrib.keys()
371
372 ##
373 # Gets element attributes, as a sequence. The attributes are
374 # returned in an arbitrary order.
375 #
376 # @return A list of (name, value) tuples for all attributes.
377 # @defreturn list of (string, string) tuples
378
379 def items(self):
380 return self.attrib.items()
381
382 ##
383 # Creates a tree iterator. The iterator loops over this element
384 # and all subelements, in document order, and returns all elements
385 # with a matching tag.
386 # <p>
387 # If the tree structure is modified during iteration, the result
388 # is undefined.
389 #
390 # @param tag What tags to look for (default is to return all elements).
391 # @return A list or iterator containing all the matching elements.
392 # @defreturn list or iterator
393
394 def getiterator(self, tag=None):
395 nodes = []
396 if tag == "*":
397 tag = None
398 if tag is None or self.tag == tag:
399 nodes.append(self)
400 for node in self._children:
401 nodes.extend(node.getiterator(tag))
402 return nodes
403
# compatibility: keep the _Element name bound to the element class
_Element = _ElementInterface
406
407##
408# Element factory. This function returns an object implementing the
409# standard Element interface. The exact class or type of that object
410# is implementation dependent, but it will always be compatible with
411# the {@link #_ElementInterface} class in this module.
412# <p>
413# The element name, attribute names, and attribute values can be
414# either 8-bit ASCII strings or Unicode strings.
415#
416# @param tag The element name.
417# @param attrib An optional dictionary, containing element attributes.
418# @param **extra Additional attributes, given as keyword arguments.
419# @return An element instance.
420# @defreturn Element
421
def Element(tag, attrib={}, **extra):
    # Work on a private copy, so that neither the caller's dictionary
    # nor the shared default argument is ever modified.
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
426
427##
428# Subelement factory. This function creates an element instance, and
429# appends it to an existing element.
430# <p>
431# The element name, attribute names, and attribute values can be
432# either 8-bit ASCII strings or Unicode strings.
433#
434# @param parent The parent element.
435# @param tag The subelement name.
436# @param attrib An optional dictionary, containing element attributes.
437# @param **extra Additional attributes, given as keyword arguments.
438# @return An element instance.
439# @defreturn Element
440
def SubElement(parent, tag, attrib={}, **extra):
    # Combine the dictionary and keyword attributes in a private copy,
    # then let the parent build a child of its own kind and attach it.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
447
448##
449# Comment element factory. This factory function creates a special
450# element that will be serialized as an XML comment.
451# <p>
452# The comment string can be either an 8-bit ASCII string or a Unicode
453# string.
454#
455# @param text A string containing the comment string.
456# @return An element instance, representing a comment.
457# @defreturn Element
458
def Comment(text=None):
    # The factory function itself is used as the tag; the serializer
    # recognizes comments with an identity check against it.
    comment = Element(Comment)
    comment.text = text
    return comment
463
464##
465# PI element factory. This factory function creates a special element
466# that will be serialized as an XML processing instruction.
467#
468# @param target A string containing the PI target.
469# @param text A string containing the PI contents, if any.
470# @return An element instance, representing a PI.
471# @defreturn Element
472
def ProcessingInstruction(target, text=None):
    # The factory function itself is used as the tag.  The element text
    # holds the target, followed by a space and the contents if any.
    pi = Element(ProcessingInstruction)
    pi.text = (target + " " + text) if text else target
    return pi
479
# Short alias for the ProcessingInstruction factory; exported in __all__.
PI = ProcessingInstruction
481
482##
483# QName wrapper. This can be used to wrap a QName attribute value, in
484# order to get proper namespace handling on output.
485#
486# @param text A string containing the QName value, in the form {uri}local,
487# or, if the tag argument is given, the URI part of a QName.
488# @param tag Optional tag. If given, the first argument is interpreted as
489# an URI, and this argument is interpreted as a local name.
490# @return An opaque object, representing the QName.
491
class QName:
    # Comparison and hashing are all based on the wrapped text, so a
    # QName compares equal to (and hashes like) the plain string with
    # the same contents.  Rich comparison methods are used because the
    # __cmp__ protocol and the cmp() builtin are not available on
    # Python 3; without __eq__, equal QNames would compare by identity.

    def __init__(self, text_or_uri, tag=None):
        # with a tag, the first argument is the namespace URI
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def _other_text(self, other):
        # unwrap another QName; anything else is compared as-is
        if isinstance(other, QName):
            return other.text
        return other

    def __eq__(self, other):
        return self.text == self._other_text(other)

    def __ne__(self, other):
        return self.text != self._other_text(other)

    def __lt__(self, other):
        return self.text < self._other_text(other)

    def __le__(self, other):
        return self.text <= self._other_text(other)

    def __gt__(self, other):
        return self.text > self._other_text(other)

    def __ge__(self, other):
        return self.text >= self._other_text(other)

    def __cmp__(self, other):
        # Retained for explicit callers; written without cmp() so that
        # it also works where that builtin does not exist.
        other = self._other_text(other)
        return (self.text > other) - (self.text < other)
505
506##
507# ElementTree wrapper class. This class represents an entire element
508# hierarchy, and adds some extra support for serialization to and from
509# standard XML.
510#
511# @param element Optional root element.
512# @keyparam file Optional file handle or name. If given, the
513# tree is initialized with the contents of this XML file.
514
class ElementTree:

    ##
    # Creates the tree wrapper.  If a file is given, it is parsed
    # immediately and its root replaces the element argument.
    #
    # @param element Optional root element.
    # @keyparam file Optional file handle or name.
    # @exception AssertionError If element is not a valid object.

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.  If a
    # file name is given, the file is opened in binary mode and closed
    # again before this method returns, also if parsing fails.  A
    # file object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what this method opened itself
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path), except that a path starting with
    # "/" is made relative to the root by prefixing it with ".".
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path), except that a path
    # starting with "/" is made relative to the root.
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path), except that a path starting
    # with "/" is made relative to the root.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.  If a file name is
    # given, the file is opened in binary mode when an encoding is
    # given and in text mode otherwise, and it is closed (and thereby
    # flushed) before this method returns.  A file object passed in by
    # the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is None)

    def write(self, file, encoding=None):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            if encoding:
                file = open(file, "wb")
            else:
                file = open(file, "w")
        try:
            # an XML declaration is only written for an explicit
            # encoding other than utf-8
            if encoding and encoding != "utf-8":
                file.write(_encode("<?xml version='1.0' encoding='%s'?>\n" % encoding, encoding))
            self._write(file, self._root, encoding, {})
        finally:
            # only close what this method opened itself
            if opened_here:
                file.close()

    ##
    # Serializes one node, its subtree and its tail text.
    #
    # @param file An object with a write method.
    # @param node The element to serialize.
    # @param encoding Output encoding, or None to write unencoded strings.
    # @param namespaces Dictionary mapping namespace URIs to the
    #     prefixes currently in scope; updated while the node's
    #     subtree is written and restored afterwards.

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        tag = node.tag
        if tag is Comment:
            file.write(_encode("<!-- %s -->" % _escape_cdata(node.text), encoding))
        elif tag is ProcessingInstruction:
            file.write(_encode("<?%s?>" % _escape_cdata(node.text), encoding))
        else:
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write(_encode("<" + tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
                # declarations collected above go last in the start tag
                for k, v in xmlns_items:
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
            if node.text or len(node):
                file.write(_encode(">", encoding))
                if node.text:
                    file.write(_encode(_escape_cdata(node.text), encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write(_encode("</" + tag + ">", encoding))
            else:
                file.write(_encode(" />", encoding))
            # namespaces declared on this element go out of scope here
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_encode(_escape_cdata(node.tail), encoding))
Armin Rigo9ed73062005-12-14 18:10:45 +0000689
690# --------------------------------------------------------------------
691# helpers
692
693##
694# Checks if an object appears to be a valid element object.
695#
# @param element An element instance.
697# @return A true value if this is an element object.
698# @defreturn flag
699
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    # Instances of the element class always qualify; any other object
    # qualifies if it has a "tag" attribute.
    if isinstance(element, _ElementInterface):
        return True
    return hasattr(element, "tag")
704
705##
706# Writes an element tree or element structure to sys.stdout. This
707# function should be used for debugging only.
708# <p>
709# The exact output format is implementation dependent. In this
710# version, it's written as an ordinary XML file.
711#
712# @param elem An element tree or an individual element.
713
def dump(elem):
    # debugging
    # accept either a tree or a bare element
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout)
    # finish with a newline unless the root's tail already ends in one
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
722
723def _encode(s, encoding):
Guido van Rossum34d19282007-08-09 01:03:29 +0000724 if encoding:
Armin Rigo9ed73062005-12-14 18:10:45 +0000725 return s.encode(encoding)
Guido van Rossum34d19282007-08-09 01:03:29 +0000726 else:
727 return s
Armin Rigo9ed73062005-12-14 18:10:45 +0000728
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +0000729_escape = re.compile(r"[&<>\"\u0080-\uffff]+")
Armin Rigo9ed73062005-12-14 18:10:45 +0000730
731_escape_map = {
732 "&": "&amp;",
733 "<": "&lt;",
734 ">": "&gt;",
735 '"': "&quot;",
736}
737
738_namespace_map = {
739 # "well-known" namespace prefixes
740 "http://www.w3.org/XML/1998/namespace": "xml",
741 "http://www.w3.org/1999/xhtml": "html",
742 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
743 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
744}
745
746def _raise_serialization_error(text):
747 raise TypeError(
748 "cannot serialize %r (type %s)" % (text, type(text).__name__)
749 )
750
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, map=_escape_map):
        # named entity if the map has one, numeric reference otherwise
        def entity_for(char):
            named = map.get(char)
            if named is None:
                return "&#%d;" % ord(char)
            return named
        return "".join([entity_for(char) for char in m.group()])
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        # raised for values the pattern cannot process
        _raise_serialization_error(text)
766
767#
768# the following functions assume an ascii-compatible encoding
769# (or "utf-16")
770
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +0000771def _escape_cdata(text):
Armin Rigo9ed73062005-12-14 18:10:45 +0000772 # escape character data
773 try:
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000774 text = text.replace("&", "&amp;")
775 text = text.replace("<", "&lt;")
776 text = text.replace(">", "&gt;")
Armin Rigo9ed73062005-12-14 18:10:45 +0000777 return text
778 except (TypeError, AttributeError):
779 _raise_serialization_error(text)
780
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +0000781def _escape_attrib(text):
Armin Rigo9ed73062005-12-14 18:10:45 +0000782 # escape attribute value
783 try:
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000784 text = text.replace("&", "&amp;")
785 text = text.replace("'", "&apos;") # FIXME: overkill
786 text = text.replace("\"", "&quot;")
787 text = text.replace("<", "&lt;")
788 text = text.replace(">", "&gt;")
Armin Rigo9ed73062005-12-14 18:10:45 +0000789 return text
790 except (TypeError, AttributeError):
791 _raise_serialization_error(text)
792
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = tag[1:].split("}", 1)
    xmlns = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # not in scope yet: use the well-known prefix if there is one,
        # otherwise generate one, and remember it for nested elements
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        # no declaration is produced for the "xml" prefix
        if prefix != "xml":
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, tag), xmlns
812
813##
814# Parses an XML document into an element tree.
815#
816# @param source A filename or file object containing XML data.
817# @param parser An optional parser instance. If not given, the
818# standard {@link XMLTreeBuilder} parser is used.
819# @return An ElementTree instance
820
def parse(source, parser=None):
    # Build an empty tree and let it load itself from the source.
    document = ElementTree()
    document.parse(source, parser)
    return document
825
826##
827# Parses an XML document into an element tree incrementally, and reports
828# what's going on to the user.
829#
830# @param source A filename or file object containing XML data.
831# @param events A list of events to report back. If omitted, only "end"
832# events are reported.
833# @return A (event, elem) iterator.
834
class iterparse:

    ##
    # Sets up the parser and installs one handler per requested event
    # on the underlying parser object.  Each handler appends an
    # (event, value) tuple to the event buffer.
    #
    # @param source A filename or file object containing XML data.
    #     A file opened here from a filename is closed when the end
    #     of the data is reached.
    # @param events A list of events to report back.  If omitted,
    #     only "end" events are reported.

    def __init__(self, source, events=None):
        # remember whether the file is ours to close
        self._close_file = not hasattr(source, "read")
        if self._close_file:
            source = open(source, "rb")
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the plain start method
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # NOTE(review): when _encode returns bytes for an
                    # ASCII-only uri, the reported uri is a bytes
                    # object rather than a string -- confirm that this
                    # is what callers expect.
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    ##
    # Returns the next (event, value) tuple.  When the event buffer is
    # exhausted, another chunk of data is read and fed to the parser.
    # Once all events have been delivered, the <b>root</b> attribute is
    # set to the document root and iteration stops.
    #
    # @return An (event, value) tuple.
    # @exception StopIteration When there are no more events.

    def __next__(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # parser already closed and buffer drained: done
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    # closing the parser may still produce events,
                    # which the next loop iteration picks up
                    self._root = self._parser.close()
                    self._parser = None
                    if self._close_file:
                        self._file.close()
            else:
                self._index = self._index + 1
                return item

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +0000913
914##
915# Parses an XML document from a string constant. This function can
916# be used to embed "XML literals" in Python code.
917#
# @param text A string containing XML data.
919# @return An Element instance.
920# @defreturn Element
921
def XML(text):
    """Parse the XML document held in *text* and return its root element."""
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
926
927##
928# Parses an XML document from a string constant, and also returns
929# a dictionary which maps from element id:s to elements.
930#
# @param text A string containing XML data.
932# @return A tuple containing an Element instance and a dictionary.
933# @defreturn (Element, dictionary)
934
def XMLID(text):
    """Parse *text* and return ``(root, ids)``.

    ``ids`` maps each non-empty "id" attribute value found in the tree
    to the element carrying it.  When the same id occurs more than once,
    the element visited last wins.
    """
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if not key:
            # missing or empty id attribute: not indexed
            continue
        by_id[key] = node
    return root, by_id
945
946##
947# Parses an XML document from a string constant. Same as {@link #XML}.
948#
949# @def fromstring(text)
# @param text A string containing XML data.
951# @return An Element instance.
952# @defreturn Element
953
# fromstring is bound to the very same function object as XML.
fromstring = XML
955
956##
957# Generates a string representation of an XML element, including all
Guido van Rossum34d19282007-08-09 01:03:29 +0000958# subelements. If encoding is None, the return type is a string;
959# otherwise it is a bytes array.
Armin Rigo9ed73062005-12-14 18:10:45 +0000960#
961# @param element An Element instance.
Guido van Rossum34d19282007-08-09 01:03:29 +0000962# @return An (optionally) encoded string containing the XML data.
Armin Rigo9ed73062005-12-14 18:10:45 +0000963# @defreturn string
964
def tostring(element, encoding=None):
    """Serialize *element* and its subelements.

    The element is wrapped in an ElementTree and written to an in-memory
    sink; the collected chunks are joined as bytes when *encoding* is
    truthy, and as a string otherwise.
    """
    class dummy:
        pass
    chunks = []
    sink = dummy()
    # the writer only needs a write() method, so collect chunks in a list
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    separator = b"" if encoding else ""
    return separator.join(chunks)
Armin Rigo9ed73062005-12-14 18:10:45 +0000976
977##
978# Generic element structure builder. This builder converts a sequence
979# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
980# #TreeBuilder.end} method calls to a well-formed element structure.
981# <p>
982# You can use this class to build an element structure using a custom XML
983# parser, or a parser for some other XML-like format.
984#
985# @param element_factory Optional element factory. This factory
986# is called to create new Element instances, as necessary.
987
class TreeBuilder:

    ##
    # Creates a builder.
    #
    # @param element_factory Optional element factory.  It is called as
    #     factory(tag, attrs) to create each new element.  If omitted,
    #     the module's default element class is used.

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert not self._elem, "missing end tags"
        # identity test: comparing with != would dispatch to the
        # element's own comparison methods, which may not treat None
        # the way this check needs
        assert self._last is not None, "missing toplevel element"
        return self._last

    ##
    # (Internal) Joins the collected text chunks and stores the result
    # as the text of the last element (before its end tag) or as its
    # tail (after its end tag).  Text seen before any element is
    # discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  Chunks are collected and joined when the
    #     next start or end tag is seen.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach to the innermost open element
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.  It must match the tag of the
    #     innermost open element.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
1064
1065##
1066# Element structure builder for XML source data, based on the
1067# <b>expat</b> parser.
1068#
1069# @keyparam target Target object. If omitted, the builder uses an
1070# instance of the standard {@link #TreeBuilder} class.
1071# @keyparam html Predefine HTML entities. This flag is not supported
1072# by the current implementation.
1073# @see #ElementTree
1074# @see #TreeBuilder
1075
class XMLTreeBuilder:

    ##
    # Creates an expat-based builder.
    #
    # @keyparam html Accepted for compatibility; not used here.
    # @keyparam target Object receiving start/data/end/close calls.  If
    #     omitted, a new TreeBuilder instance is used.

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" as namespace separator: qualified names arrive as
        # "uri}local" and are turned into "{uri}local" by _fixname
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # replacement text for undefined entities

    ##
    # (Internal) Expands a qualified name from expat's "uri}local" form
    # to "{uri}local".  Results are memoized in self._names.

    def _fixname(self, key):
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    ##
    # (Internal) Start handler for dictionary-style attributes.

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self._target.start(tag, attrib)

    ##
    # (Internal) Start handler for ordered attributes, where attrib_in
    # is a flat [name, value, name, value, ...] list.

    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = attrib_in[i+1]
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(text)

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    ##
    # (Internal) Default handler.  Resolves undefined entities through
    # self.entity, and collects the tokens of a doctype declaration so
    # that self.doctype can be called once enough of them were seen.

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # identifiers arrive with their surrounding quotes
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; subclasses may override it.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder cannot be used
    # again afterwards, because its parser and target references are
    # deleted.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001214
1215# compatibility
# XMLParser is bound to the very same class object as XMLTreeBuilder.
XMLParser = XMLTreeBuilder