blob: cfac4f7090fae6a8ede7c97b32c8aae918680dd9 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
# Names exported by "from <this module> import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML", "XMLID",
    "XMLParser", "XMLTreeBuilder",
    ]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000112import sys, re
Armin Rigo9ed73062005-12-14 18:10:45 +0000113
Alex Martelli6cefeb02006-08-21 23:45:19 +0000114from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000115
116# TODO: add support for custom namespace resolvers/default namespaces
117# TODO: add improved support for incremental parsing
118
# Version string of this ElementTree implementation.
VERSION = "1.2.6"
120
121##
122# Internal element class. This class defines the Element interface,
123# and provides a reference implementation of this interface.
124# <p>
125# You should not create instances of this class directly. Use the
126# appropriate factory functions instead, such as {@link #Element}
127# and {@link #SubElement}.
128#
129# @see Element
130# @see SubElement
131# @see Comment
132# @see ProcessingInstruction
133
class _ElementInterface:
    """Internal element class: the reference Element implementation.

    An element corresponds to ``<tag attrib>text<child/>...</tag>tail``.

    Do not create instances of this class directly; use the factory
    functions Element and SubElement instead.
    """

    # (Attribute) Element tag.
    tag = None

    # (Attribute) Element attribute dictionary.  Where possible, use
    # get(), set(), keys() and items() to access element attributes.
    attrib = None

    # (Attribute) Text before the first subelement.  This is either a
    # string or the value None, if there was no text.
    text = None

    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.
    tail = None

    def __init__(self, tag, attrib):
        """Store *tag* and *attrib* (not copied) and start with no children."""
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element via the Element factory.

        tag -- element tag.
        attrib -- element attributes, given as a dictionary.
        Returns the new element instance.
        """
        return Element(tag, attrib)

    def __len__(self):
        """Return the number of subelements."""
        return len(self._children)

    def __getitem__(self, index):
        """Return the subelement (or, for a slice, list of subelements) at *index*.

        Raises IndexError if the given element does not exist.
        """
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement at *index* with *element*.

        When *index* is a slice, *element* is a sequence of elements that
        replaces that range.

        Raises IndexError if the given element does not exist, and
        AssertionError if a new value is not a valid element object.
        """
        if isinstance(index, slice):
            # Python 3 routes "elem[a:b] = seq" here instead of calling
            # __setslice__, so each member has to be validated in turn.
            replacement = list(element)
            for item in replacement:
                assert iselement(item)
            self._children[index] = replacement
        else:
            assert iselement(element)
            self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement (or slice of subelements) at *index*.

        Raises IndexError if the given element does not exist.
        """
        del self._children[index]

    def __getslice__(self, start, stop):
        """Return a list with the subelements in the range start:stop."""
        return self._children[start:stop]

    def __setslice__(self, start, stop, elements):
        """Replace the subelements in start:stop with *elements*.

        Raises AssertionError if a sequence member is not a valid object.
        """
        # Materialize first so a one-shot iterator is not exhausted by
        # the validation loop before the assignment happens.
        replacement = list(elements)
        for item in replacement:
            assert iselement(item)
        self._children[start:stop] = replacement

    def __delslice__(self, start, stop):
        """Delete the subelements in the range start:stop."""
        del self._children[start:stop]

    def append(self, element):
        """Add *element* to the end of this element's children.

        Raises AssertionError if the element is not a valid object.
        """
        assert iselement(element)
        self._children.append(element)

    def insert(self, index, element):
        """Insert *element* at position *index* among the children.

        Raises AssertionError if the element is not a valid object.
        """
        assert iselement(element)
        self._children.insert(index, element)

    def remove(self, element):
        """Remove the first child matching *element* (via list.remove).

        Raises ValueError if no matching child is found, and
        AssertionError if the element is not a valid object.
        """
        assert iselement(element)
        self._children.remove(element)

    def getchildren(self):
        """Return the internal list of subelements, in document order.

        The list is not copied, so changes to it affect this element.
        """
        return self._children

    def find(self, path):
        """Find the first matching subelement, by tag name or path.

        Delegates to ElementPath.find(self, path).
        """
        return ElementPath.find(self, path)

    def findtext(self, path, default=None):
        """Find text for the first matching subelement, by tag name or path.

        path -- what element to look for.
        default -- what to return if the element was not found.

        Delegates to ElementPath.findtext(self, path, default).  According
        to the original documentation, an element that is found but has no
        text content yields an empty string.
        """
        return ElementPath.findtext(self, path, default)

    def findall(self, path):
        """Find all matching subelements, by tag name or path.

        Delegates to ElementPath.findall(self, path).
        """
        return ElementPath.findall(self, path)

    def clear(self):
        """Reset this element.

        Removes all subelements, clears the attribute dictionary in place,
        and sets the text and tail attributes to None.
        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Return attribute *key*, or *default* if it is not set."""
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute *key* to *value*."""
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names, as given by self.attrib.keys()."""
        return self.attrib.keys()

    def items(self):
        """Return the (name, value) attribute pairs, as given by self.attrib.items()."""
        return self.attrib.items()

    def getiterator(self, tag=None):
        """Return a list of this element and all subelements, in document order.

        tag -- only elements whose tag equals this value are returned;
            None or "*" selects every element.

        If the tree structure is modified during iteration, the result
        is undefined.
        """
        if tag == "*":
            tag = None
        matches = []
        if tag is None or self.tag == tag:
            matches.append(self)
        for child in self._children:
            matches.extend(child.getiterator(tag))
        return matches

# compatibility
_Element = _ElementInterface
406
407##
408# Element factory. This function returns an object implementing the
409# standard Element interface. The exact class or type of that object
410# is implementation dependent, but it will always be compatible with
411# the {@link #_ElementInterface} class in this module.
412# <p>
413# The element name, attribute names, and attribute values can be
414# either 8-bit ASCII strings or Unicode strings.
415#
416# @param tag The element name.
417# @param attrib An optional dictionary, containing element attributes.
418# @param **extra Additional attributes, given as keyword arguments.
419# @return An element instance.
420# @defreturn Element
421
def Element(tag, attrib={}, **extra):
    """Element factory: return a new _ElementInterface instance.

    tag -- the element name.
    attrib -- an optional dictionary containing element attributes.
    **extra -- additional attributes, given as keyword arguments.
    """
    # attrib is copied before being updated, so neither the caller's
    # dictionary nor the shared default is ever mutated.
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
426
427##
428# Subelement factory. This function creates an element instance, and
429# appends it to an existing element.
430# <p>
431# The element name, attribute names, and attribute values can be
432# either 8-bit ASCII strings or Unicode strings.
433#
434# @param parent The parent element.
435# @param tag The subelement name.
436# @param attrib An optional dictionary, containing element attributes.
437# @param **extra Additional attributes, given as keyword arguments.
438# @return An element instance.
439# @defreturn Element
440
def SubElement(parent, tag, attrib={}, **extra):
    """Subelement factory: create an element and append it to *parent*.

    parent -- the parent element; must provide makeelement() and append().
    tag -- the subelement name.
    attrib -- an optional dictionary containing element attributes.
    **extra -- additional attributes, given as keyword arguments.

    Returns the new element, as created by parent.makeelement().
    """
    # Work on a copy so the caller's dictionary (and the shared default)
    # stay untouched.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
447
448##
449# Comment element factory. This factory function creates a special
450# element that will be serialized as an XML comment.
451# <p>
452# The comment string can be either an 8-bit ASCII string or a Unicode
453# string.
454#
455# @param text A string containing the comment string.
456# @return An element instance, representing a comment.
457# @defreturn Element
458
def Comment(text=None):
    """Comment element factory.

    Returns an element whose tag is the Comment function itself, which
    is how the serializer recognizes it as an XML comment.

    text -- a string containing the comment string.
    """
    element = Element(Comment)
    element.text = text
    return element
463
464##
465# PI element factory. This factory function creates a special element
466# that will be serialized as an XML processing instruction.
467#
468# @param target A string containing the PI target.
469# @param text A string containing the PI contents, if any.
470# @return An element instance, representing a PI.
471# @defreturn Element
472
def ProcessingInstruction(target, text=None):
    """PI element factory.

    Returns an element whose tag is the ProcessingInstruction function
    itself, which is how the serializer recognizes it as a processing
    instruction.

    target -- a string containing the PI target.
    text -- a string containing the PI contents, if any.

    The element's text is the target, followed by a space and *text*
    when *text* is non-empty.
    """
    element = Element(ProcessingInstruction)
    if text:
        element.text = target + " " + text
    else:
        element.text = target
    return element

# short alias
PI = ProcessingInstruction
481
482##
483# QName wrapper. This can be used to wrap a QName attribute value, in
484# order to get proper namespace handling on output.
485#
486# @param text A string containing the QName value, in the form {uri}local,
487# or, if the tag argument is given, the URI part of a QName.
488# @param tag Optional tag. If given, the first argument is interpreted as
489# an URI, and this argument is interpreted as a local name.
490# @return An opaque object, representing the QName.
491
class QName:
    """QName wrapper, used to get proper namespace handling on output.

    text_or_uri -- the QName value in the form {uri}local or, if *tag*
        is given, the URI part of a QName.
    tag -- optional local name; when given (and non-empty) the first
        argument is interpreted as a URI.

    Instances hash and compare by their ``text`` attribute, both against
    other QName instances and against plain values.
    """

    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return "<QName %r>" % (self.text,)

    def __hash__(self):
        return hash(self.text)

    @staticmethod
    def _comparable(other):
        # Unwrap another QName so every comparison is done on raw text.
        if isinstance(other, QName):
            return other.text
        return other

    def __le__(self, other):
        return self.text <= self._comparable(other)

    def __lt__(self, other):
        return self.text < self._comparable(other)

    def __ge__(self, other):
        return self.text >= self._comparable(other)

    def __gt__(self, other):
        return self.text > self._comparable(other)

    def __eq__(self, other):
        return self.text == self._comparable(other)

    def __ne__(self, other):
        return self.text != self._comparable(other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000525
526##
527# ElementTree wrapper class. This class represents an entire element
528# hierarchy, and adds some extra support for serialization to and from
529# standard XML.
530#
531# @param element Optional root element.
532# @keyparam file Optional file handle or name. If given, the
533# tree is initialized with the contents of this XML file.
534
class ElementTree:
    """ElementTree wrapper class.

    Represents an entire element hierarchy and adds support for
    serialization to and from standard XML.

    element -- optional root element.
    file -- optional file handle or name.  If given, the tree is
        initialized with the contents of this XML file.
    """

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return the root element of this tree (None if not set)."""
        return self._root

    def _setroot(self, element):
        """Replace the root element of this tree.

        This discards the current contents of the tree and replaces it
        with the given element.  Use with care.
        """
        assert iselement(element)
        self._root = element

    def parse(self, source, parser=None):
        """Load an external XML document into this element tree.

        source -- a file name or an object with a read() method.
        parser -- an optional parser instance.  If not given, an
            XMLTreeBuilder is used.

        Returns the document root element.  A file opened here from a
        name is closed before returning; a caller-supplied file object
        is left open.
        """
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while True:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    def getiterator(self, tag=None):
        """Return the root element's getiterator(tag) result.

        tag -- what tags to look for (default is to return all elements).
        """
        assert self._root is not None
        return self._root.getiterator(tag)

    def _rooted(self, path):
        # A leading "/" is rewritten to "./" so the path is evaluated
        # relative to the root element.
        if path[:1] == "/":
            path = "." + path
        return path

    def find(self, path):
        """Same as getroot().find(path), with a leading "/" made relative."""
        assert self._root is not None
        return self._root.find(self._rooted(path))

    def findtext(self, path, default=None):
        """Same as getroot().findtext(path, default), with a leading "/" made relative.

        path -- what element to look for.
        default -- what to return if the element was not found.
        """
        assert self._root is not None
        return self._root.findtext(self._rooted(path), default)

    def findall(self, path):
        """Same as getroot().findall(path), with a leading "/" made relative."""
        assert self._root is not None
        return self._root.findall(self._rooted(path))

    def write(self, file, encoding=None):
        """Write the element tree to a file, as XML.

        file -- a file name, or a file object opened for writing.
        encoding -- optional output encoding (default is None, meaning
            the output is written as unencoded strings).

        A file name is opened in binary mode when an encoding is given
        and in text mode otherwise; such a file is closed before
        returning.  An XML declaration is written only for encodings
        other than "utf-8".
        """
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            if encoding:
                file = open(file, "wb")
            else:
                file = open(file, "w")
        try:
            if encoding and encoding != "utf-8":
                file.write(_encode("<?xml version='1.0' encoding='%s'?>\n" % encoding, encoding))
            self._write(file, self._root, encoding, {})
        finally:
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        """Recursively write *node* and its tail to *file* as XML.

        namespaces -- maps namespace URIs to the prefixes currently in
            scope; entries added for this node are removed again before
            its tail is written.
        """
        tag = node.tag
        if tag is Comment:
            file.write(_encode("<!-- %s -->" % _escape_cdata(node.text), encoding))
        elif tag is ProcessingInstruction:
            file.write(_encode("<?%s?>" % _escape_cdata(node.text), encoding))
        else:
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write(_encode("<" + tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
                for k, v in xmlns_items:
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
            if node.text or len(node):
                file.write(_encode(">", encoding))
                if node.text:
                    file.write(_encode(_escape_cdata(node.text), encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write(_encode("</" + tag + ">", encoding))
            else:
                file.write(_encode(" />", encoding))
            # each xmlns item is ("xmlns:prefix", uri); drop the URIs
            # declared by this element now that its scope has ended
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_encode(_escape_cdata(node.tail), encoding))
Armin Rigo9ed73062005-12-14 18:10:45 +0000709
710# --------------------------------------------------------------------
711# helpers
712
713##
714# Checks if an object appears to be a valid element object.
715#
# @param element An element instance.
717# @return A true value if this is an element object.
718# @defreturn flag
719
def iselement(element):
    """Return a true value if *element* appears to be an element object.

    An object qualifies when it is an _ElementInterface instance or
    simply has a "tag" attribute.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, _ElementInterface) or hasattr(element, "tag")
724
725##
726# Writes an element tree or element structure to sys.stdout. This
727# function should be used for debugging only.
728# <p>
729# The exact output format is implementation dependent. In this
730# version, it's written as an ordinary XML file.
731#
732# @param elem An element tree or an individual element.
733
def dump(elem):
    """Write an element tree or element structure to sys.stdout.

    This function should be used for debugging only.  The output is
    written as ordinary XML, followed by a newline unless the root
    element's tail already ends with one.

    elem -- an ElementTree instance or an individual element.
    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
742
def _encode(s, encoding):
    """Return *s* encoded with *encoding*, or *s* unchanged if no encoding is given."""
    if not encoding:
        return s
    return s.encode(encoding)
Armin Rigo9ed73062005-12-14 18:10:45 +0000748
# Matches runs of characters that must be written as entities or
# character references: the XML-reserved &, <, > and ", plus every
# non-ASCII character.  The range is spelled with string-literal
# escapes (not a raw string) so it does not depend on the regex engine
# understanding \u, and it extends to U+10FFFF so characters outside
# the Basic Multilingual Plane are covered as well.
_escape = re.compile("[&<>\"\u0080-\U0010ffff]+")

# Reserved characters and the predefined entities that replace them.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
765
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* (and its type) cannot be serialized."""
    type_name = type(text).__name__
    raise TypeError("cannot serialize %r (type %s)" % (text, type_name))
770
def _encode_entity(text, pattern=_escape):
    """Return *text* ASCII-encoded, with special characters as entities.

    Every run matched by *pattern* is rewritten character by character:
    characters listed in _escape_map become their named entity, all
    others become a decimal character reference (&#NNN;).

    A TypeError raised while substituting or encoding is reported
    through _raise_serialization_error.
    """
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(match, entities=_escape_map):
        pieces = []
        for char in match.group():
            replacement = entities.get(char)
            if replacement is None:
                replacement = "&#%d;" % ord(char)
            pieces.append(replacement)
        return "".join(pieces)
    try:
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        _raise_serialization_error(text)
786
787#
788# the following functions assume an ascii-compatible encoding
789# (or "utf-16")
790
def _escape_cdata(text):
    """Return *text* with &, < and > replaced by XML entities.

    A TypeError or AttributeError raised by the replacement (for example
    because *text* is not a string) is reported through
    _raise_serialization_error.
    """
    # escape character data
    try:
        # "&" must be handled first so the entities inserted by the
        # later replacements are not escaped a second time
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
800
def _escape_attrib(text):
    """Return *text* with &, ', ", < and > replaced by XML entities.

    A TypeError or AttributeError raised by the replacement (for example
    because *text* is not a string) is reported through
    _raise_serialization_error.
    """
    # escape attribute value
    try:
        # "&" must be handled first so the entities inserted by the
        # later replacements are not escaped a second time
        text = text.replace("&", "&amp;")
        text = text.replace("'", "&apos;") # FIXME: overkill
        text = text.replace("\"", "&quot;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
812
def fixtag(tag, namespaces):
    """Turn a decorated tag ({uri}local) into a prefixed tag (prefix:local).

    tag -- a QName instance or a string of the form {uri}local.
    namespaces -- dictionary mapping namespace URIs to the prefixes
        already in scope; a newly assigned prefix is recorded in it.

    Returns (prefixed_tag, xmlns), where xmlns is an
    ("xmlns:prefix", uri) pair when a new declaration must be written,
    and None when the prefix was already known or is "xml".
    """
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = tag[1:].split("}", 1)
    xmlns = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # not in scope yet: prefer a well-known prefix, otherwise
        # generate one from the number of namespaces seen so far
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix != "xml":
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, tag), xmlns
832
833##
834# Parses an XML document into an element tree.
835#
836# @param source A filename or file object containing XML data.
837# @param parser An optional parser instance. If not given, the
838# standard {@link XMLTreeBuilder} parser is used.
839# @return An ElementTree instance
840
def parse(source, parser=None):
    """Parse an XML document into an element tree.

    source -- a filename or file object containing XML data.
    parser -- an optional parser instance, passed on to
        ElementTree.parse().

    Returns an ElementTree instance.
    """
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
845
846##
847# Parses an XML document into an element tree incrementally, and reports
848# what's going on to the user.
849#
850# @param source A filename or file object containing XML data.
851# @param events A list of events to report back. If omitted, only "end"
852# events are reported.
853# @return A (event, elem) iterator.
854
class iterparse:
    """Parse an XML document incrementally, yielding (event, elem) pairs.

    source -- a filename or file object containing XML data.
    events -- a list of events to report back: any of "start", "end",
        "start-ns" and "end-ns".  If omitted, only "end" events are
        reported.

    Once iteration is complete, the ``root`` attribute holds the value
    returned by the parser's close() method.  A file opened here from a
    name is closed when the end of the document is reached.
    """

    def __init__(self, source, events=None):
        # only close files this object opened itself
        self._close_file = not hasattr(source, "read")
        if self._close_file:
            source = open(source, "rb")
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back to the builder's plain start handler
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                # The URI is reported as received from the parser;
                # encoding it to ASCII would turn it into a bytes
                # object under Python 3.
                def handler(prefix, uri, event=event, append=append):
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def __next__(self):
        """Return the next (event, elem) pair, reading more data as needed.

        Raises StopIteration once the document has been fully parsed
        and all buffered events have been returned.
        """
        while True:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # parsing finished and the buffer is drained
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    try:
                        self._root = self._parser.close()
                    finally:
                        if self._close_file:
                            self._file.close()
                    self._parser = None
            else:
                self._index = self._index + 1
                return item

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +0000933
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element
941
def XML(text):
    # Push the complete text through a fresh builder in one go and
    # hand back whatever the builder produces when it is closed.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
946
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param text A string containing XML data.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)
954
def XMLID(text):
    # Build the tree exactly like XML() does.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # Pair every element with its "id" attribute, then keep only the
    # pairs whose id is truthy.  Later elements overwrite earlier ones
    # that share the same id.
    tagged = ((node.get("id"), node) for node in root.getiterator())
    by_id = {key: node for key, node in tagged if key}
    return root, by_id
965
##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element
973
# Module-level alias: fromstring is bound to the very same function
# object as XML.
fromstring = XML
975
976##
977# Generates a string representation of an XML element, including all
Guido van Rossum34d19282007-08-09 01:03:29 +0000978# subelements. If encoding is None, the return type is a string;
979# otherwise it is a bytes array.
Armin Rigo9ed73062005-12-14 18:10:45 +0000980#
981# @param element An Element instance.
Guido van Rossum34d19282007-08-09 01:03:29 +0000982# @return An (optionally) encoded string containing the XML data.
Armin Rigo9ed73062005-12-14 18:10:45 +0000983# @defreturn string
984
def tostring(element, encoding=None):
    # ElementTree.write only needs an object with a "write" attribute,
    # so a bare instance whose write is list.append collects the
    # output fragments.
    class _Collector:
        pass
    chunks = []
    sink = _Collector()
    sink.write = chunks.append
    ElementTree(element).write(sink, encoding)
    # Fragments are joined as bytes when an encoding was requested and
    # as text otherwise.
    separator = b"" if encoding else ""
    return separator.join(chunks)
Armin Rigo9ed73062005-12-14 18:10:45 +0000996
997##
998# Generic element structure builder. This builder converts a sequence
999# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1000# #TreeBuilder.end} method calls to a well-formed element structure.
1001# <p>
1002# You can use this class to build an element structure using a custom XML
1003# parser, or a parser for some other XML-like format.
1004#
1005# @param element_factory Optional element factory. This factory
1006# is called to create new Element instances, as necessary.
1007
class TreeBuilder:

    ##
    # @param element_factory Optional element factory.  It is called as
    #     factory(tag, attrs) to create each new element; the result
    #     must provide "tag", "text" and "tail" attributes and an
    #     "append" method.

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = False # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert not self._elem, "missing end tags"
        # Identity test on purpose: "!= None" would be routed through
        # the element's own comparison methods, which a custom element
        # factory may have overridden.
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Moves the collected character data to the "text" attribute (if we
    # are right after a start tag) or the "tail" attribute (if we are
    # right after an end tag) of the last element.  Data collected
    # before any element has been seen is dropped.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach to the innermost element that is still open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = False
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = True
        return self._last
1084
1085##
1086# Element structure builder for XML source data, based on the
1087# <b>expat</b> parser.
1088#
1089# @keyparam target Target object. If omitted, the builder uses an
1090# instance of the standard {@link #TreeBuilder} class.
1091# @keyparam html Predefine HTML entities. This flag is not supported
1092# by the current implementation.
1093# @see #ElementTree
1094# @see #TreeBuilder
1095
class XMLTreeBuilder:

    ##
    # @keyparam html Accepted for compatibility; not used by this
    #     implementation.
    # @keyparam target Target object providing start, data, end and
    #     close methods.  If omitted, a TreeBuilder instance is used.
    # @exception ImportError If the expat module is not available.

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # namespace-aware parser; "}" separates the namespace URI from
        # the local name, see _fixname
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None
        self.entity = {}

    def _fixname(self, key):
        # expand "uri}local" names reported by expat to "{uri}local";
        # results are memoized per parser
        try:
            return self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
            return name

    def _start(self, tag, attrib_in):
        # start handler for dictionary-style attributes
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {fixname(key): value for key, value in attrib_in.items()}
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for ordered attributes, which arrive as a flat
        # [name, value, name, value, ...] sequence
        fixname = self._fixname
        tag = fixname(tag)
        flat = attrib_in or ()
        attrib = {fixname(key): value
                  for key, value in zip(flat[::2], flat[1::2])}
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(text)

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities; only the table lookup is
            # guarded, so a KeyError raised by the target itself is not
            # misreported as an undefined entity
            try:
                replacement = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                     self._parser.ErrorColumnNumber)
                    )
            self._target.data(replacement)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if pubid:
                    pubid = pubid[1:-1] # strip the quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder cannot be used
    # any more afterwards.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001234
# compatibility: XMLParser is a second name bound to the XMLTreeBuilder
# class itself (an alias, not a subclass)
XMLParser = XMLTreeBuilder