blob: 7dbc72e78f958971268f4eec3ddbbbced2ee0070 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
Armin Rigo9ed73062005-12-14 18:10:45 +000073__all__ = [
74 # public symbols
75 "Comment",
76 "dump",
77 "Element", "ElementTree",
78 "fromstring",
79 "iselement", "iterparse",
80 "parse",
81 "PI", "ProcessingInstruction",
82 "QName",
83 "SubElement",
84 "tostring",
85 "TreeBuilder",
86 "VERSION", "XML",
Fredrik Lundhbf84e542006-07-06 12:29:24 +000087 "XMLParser", "XMLTreeBuilder",
Armin Rigo9ed73062005-12-14 18:10:45 +000088 ]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
112import string, sys, re
113
114class _SimpleElementPath:
115 # emulate pre-1.2 find/findtext/findall behaviour
116 def find(self, element, tag):
117 for elem in element:
118 if elem.tag == tag:
119 return elem
120 return None
121 def findtext(self, element, tag, default=None):
122 for elem in element:
123 if elem.tag == tag:
124 return elem.text or ""
125 return default
126 def findall(self, element, tag):
127 if tag[:3] == ".//":
128 return element.getiterator(tag[3:])
129 result = []
130 for elem in element:
131 if elem.tag == tag:
132 result.append(elem)
133 return result
134
135try:
136 import ElementPath
137except ImportError:
138 # FIXME: issue warning in this case?
139 ElementPath = _SimpleElementPath()
140
141# TODO: add support for custom namespace resolvers/default namespaces
142# TODO: add improved support for incremental parsing
143
144VERSION = "1.2.6"
145
146##
147# Internal element class. This class defines the Element interface,
148# and provides a reference implementation of this interface.
149# <p>
150# You should not create instances of this class directly. Use the
151# appropriate factory functions instead, such as {@link #Element}
152# and {@link #SubElement}.
153#
154# @see Element
155# @see SubElement
156# @see Comment
157# @see ProcessingInstruction
158
159class _ElementInterface:
160 # <tag attrib>text<child/>...</tag>tail
161
162 ##
163 # (Attribute) Element tag.
164
165 tag = None
166
167 ##
168 # (Attribute) Element attribute dictionary. Where possible, use
169 # {@link #_ElementInterface.get},
170 # {@link #_ElementInterface.set},
171 # {@link #_ElementInterface.keys}, and
172 # {@link #_ElementInterface.items} to access
173 # element attributes.
174
175 attrib = None
176
177 ##
178 # (Attribute) Text before first subelement. This is either a
179 # string or the value None, if there was no text.
180
181 text = None
182
183 ##
184 # (Attribute) Text after this element's end tag, but before the
185 # next sibling element's start tag. This is either a string or
186 # the value None, if there was no text.
187
188 tail = None # text after end tag, if any
189
190 def __init__(self, tag, attrib):
191 self.tag = tag
192 self.attrib = attrib
193 self._children = []
194
195 def __repr__(self):
196 return "<Element %s at %x>" % (self.tag, id(self))
197
198 ##
199 # Creates a new element object of the same type as this element.
200 #
201 # @param tag Element tag.
202 # @param attrib Element attributes, given as a dictionary.
203 # @return A new element instance.
204
205 def makeelement(self, tag, attrib):
206 return Element(tag, attrib)
207
208 ##
209 # Returns the number of subelements.
210 #
211 # @return The number of subelements.
212
213 def __len__(self):
214 return len(self._children)
215
216 ##
217 # Returns the given subelement.
218 #
219 # @param index What subelement to return.
220 # @return The given subelement.
221 # @exception IndexError If the given element does not exist.
222
223 def __getitem__(self, index):
224 return self._children[index]
225
226 ##
227 # Replaces the given subelement.
228 #
229 # @param index What subelement to replace.
230 # @param element The new element value.
231 # @exception IndexError If the given element does not exist.
232 # @exception AssertionError If element is not a valid object.
233
234 def __setitem__(self, index, element):
235 assert iselement(element)
236 self._children[index] = element
237
238 ##
239 # Deletes the given subelement.
240 #
241 # @param index What subelement to delete.
242 # @exception IndexError If the given element does not exist.
243
244 def __delitem__(self, index):
245 del self._children[index]
246
247 ##
248 # Returns a list containing subelements in the given range.
249 #
250 # @param start The first subelement to return.
251 # @param stop The first subelement that shouldn't be returned.
252 # @return A sequence object containing subelements.
253
254 def __getslice__(self, start, stop):
255 return self._children[start:stop]
256
257 ##
258 # Replaces a number of subelements with elements from a sequence.
259 #
260 # @param start The first subelement to replace.
261 # @param stop The first subelement that shouldn't be replaced.
262 # @param elements A sequence object with zero or more elements.
263 # @exception AssertionError If a sequence member is not a valid object.
264
265 def __setslice__(self, start, stop, elements):
266 for element in elements:
267 assert iselement(element)
268 self._children[start:stop] = list(elements)
269
270 ##
271 # Deletes a number of subelements.
272 #
273 # @param start The first subelement to delete.
274 # @param stop The first subelement to leave in there.
275
276 def __delslice__(self, start, stop):
277 del self._children[start:stop]
278
279 ##
280 # Adds a subelement to the end of this element.
281 #
282 # @param element The element to add.
283 # @exception AssertionError If a sequence member is not a valid object.
284
285 def append(self, element):
286 assert iselement(element)
287 self._children.append(element)
288
289 ##
290 # Inserts a subelement at the given position in this element.
291 #
292 # @param index Where to insert the new subelement.
293 # @exception AssertionError If the element is not a valid object.
294
295 def insert(self, index, element):
296 assert iselement(element)
297 self._children.insert(index, element)
298
299 ##
300 # Removes a matching subelement. Unlike the <b>find</b> methods,
301 # this method compares elements based on identity, not on tag
302 # value or contents.
303 #
304 # @param element What element to remove.
305 # @exception ValueError If a matching element could not be found.
306 # @exception AssertionError If the element is not a valid object.
307
308 def remove(self, element):
309 assert iselement(element)
310 self._children.remove(element)
311
312 ##
313 # Returns all subelements. The elements are returned in document
314 # order.
315 #
316 # @return A list of subelements.
317 # @defreturn list of Element instances
318
319 def getchildren(self):
320 return self._children
321
322 ##
323 # Finds the first matching subelement, by tag name or path.
324 #
325 # @param path What element to look for.
326 # @return The first matching element, or None if no element was found.
327 # @defreturn Element or None
328
329 def find(self, path):
330 return ElementPath.find(self, path)
331
332 ##
333 # Finds text for the first matching subelement, by tag name or path.
334 #
335 # @param path What element to look for.
336 # @param default What to return if the element was not found.
337 # @return The text content of the first matching element, or the
338 # default value no element was found. Note that if the element
339 # has is found, but has no text content, this method returns an
340 # empty string.
341 # @defreturn string
342
343 def findtext(self, path, default=None):
344 return ElementPath.findtext(self, path, default)
345
346 ##
347 # Finds all matching subelements, by tag name or path.
348 #
349 # @param path What element to look for.
350 # @return A list or iterator containing all matching elements,
351 # in document order.
352 # @defreturn list of Element instances
353
354 def findall(self, path):
355 return ElementPath.findall(self, path)
356
357 ##
358 # Resets an element. This function removes all subelements, clears
359 # all attributes, and sets the text and tail attributes to None.
360
361 def clear(self):
362 self.attrib.clear()
363 self._children = []
364 self.text = self.tail = None
365
366 ##
367 # Gets an element attribute.
368 #
369 # @param key What attribute to look for.
370 # @param default What to return if the attribute was not found.
371 # @return The attribute value, or the default value, if the
372 # attribute was not found.
373 # @defreturn string or None
374
375 def get(self, key, default=None):
376 return self.attrib.get(key, default)
377
378 ##
379 # Sets an element attribute.
380 #
381 # @param key What attribute to set.
382 # @param value The attribute value.
383
384 def set(self, key, value):
385 self.attrib[key] = value
386
387 ##
388 # Gets a list of attribute names. The names are returned in an
389 # arbitrary order (just like for an ordinary Python dictionary).
390 #
391 # @return A list of element attribute names.
392 # @defreturn list of strings
393
394 def keys(self):
395 return self.attrib.keys()
396
397 ##
398 # Gets element attributes, as a sequence. The attributes are
399 # returned in an arbitrary order.
400 #
401 # @return A list of (name, value) tuples for all attributes.
402 # @defreturn list of (string, string) tuples
403
404 def items(self):
405 return self.attrib.items()
406
407 ##
408 # Creates a tree iterator. The iterator loops over this element
409 # and all subelements, in document order, and returns all elements
410 # with a matching tag.
411 # <p>
412 # If the tree structure is modified during iteration, the result
413 # is undefined.
414 #
415 # @param tag What tags to look for (default is to return all elements).
416 # @return A list or iterator containing all the matching elements.
417 # @defreturn list or iterator
418
419 def getiterator(self, tag=None):
420 nodes = []
421 if tag == "*":
422 tag = None
423 if tag is None or self.tag == tag:
424 nodes.append(self)
425 for node in self._children:
426 nodes.extend(node.getiterator(tag))
427 return nodes
428
429# compatibility
430_Element = _ElementInterface
431
432##
433# Element factory. This function returns an object implementing the
434# standard Element interface. The exact class or type of that object
435# is implementation dependent, but it will always be compatible with
436# the {@link #_ElementInterface} class in this module.
437# <p>
438# The element name, attribute names, and attribute values can be
439# either 8-bit ASCII strings or Unicode strings.
440#
441# @param tag The element name.
442# @param attrib An optional dictionary, containing element attributes.
443# @param **extra Additional attributes, given as keyword arguments.
444# @return An element instance.
445# @defreturn Element
446
def Element(tag, attrib={}, **extra):
    # Work on a private copy, so that neither the caller's dictionary
    # nor the shared default argument is ever modified; keyword
    # arguments are merged on top of the dictionary entries.
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
451
452##
453# Subelement factory. This function creates an element instance, and
454# appends it to an existing element.
455# <p>
456# The element name, attribute names, and attribute values can be
457# either 8-bit ASCII strings or Unicode strings.
458#
459# @param parent The parent element.
460# @param tag The subelement name.
461# @param attrib An optional dictionary, containing element attributes.
462# @param **extra Additional attributes, given as keyword arguments.
463# @return An element instance.
464# @defreturn Element
465
def SubElement(parent, tag, attrib={}, **extra):
    # Merge into a private copy (the caller's dictionary and the shared
    # default stay untouched), let the parent build an element of its
    # own kind, and attach the result as the parent's last child.
    merged = attrib.copy()
    merged.update(extra)
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
472
473##
474# Comment element factory. This factory function creates a special
475# element that will be serialized as an XML comment.
476# <p>
477# The comment string can be either an 8-bit ASCII string or a Unicode
478# string.
479#
480# @param text A string containing the comment string.
481# @return An element instance, representing a comment.
482# @defreturn Element
483
def Comment(text=None):
    # A comment is an ordinary element whose tag is this factory
    # function itself; the serializer recognizes it by identity.
    comment = Element(Comment)
    comment.text = text
    return comment
488
489##
490# PI element factory. This factory function creates a special element
491# that will be serialized as an XML processing instruction.
492#
493# @param target A string containing the PI target.
494# @param text A string containing the PI contents, if any.
495# @return An element instance, representing a PI.
496# @defreturn Element
497
def ProcessingInstruction(target, text=None):
    # A processing instruction is an ordinary element whose tag is this
    # factory function itself.  Its text holds the target, followed by
    # a space and the contents when contents are given.
    instruction = Element(ProcessingInstruction)
    instruction.text = target
    if not text:
        return instruction
    instruction.text = instruction.text + " " + text
    return instruction

PI = ProcessingInstruction
506
507##
508# QName wrapper. This can be used to wrap a QName attribute value, in
509# order to get proper namespace handling on output.
510#
511# @param text A string containing the QName value, in the form {uri}local,
512# or, if the tag argument is given, the URI part of a QName.
513# @param tag Optional tag. If given, the first argument is interpreted as
514# an URI, and this argument is interpreted as a local name.
515# @return An opaque object, representing the QName.
516
class QName:
    # Wrapper around a qualified name.  The name is kept in its
    # "{uri}local" string form in the text attribute.

    def __init__(self, text_or_uri, tag=None):
        if tag:
            # two-argument form: namespace URI plus local name
            self.text = "{%s}%s" % (text_or_uri, tag)
        else:
            # one-argument form: the value is stored as given
            self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        # hash like the wrapped string
        return hash(self.text)

    def __cmp__(self, other):
        # compare by text, against other QNames and plain values alike
        if isinstance(other, QName):
            other = other.text
        return cmp(self.text, other)
529 return cmp(self.text, other)
530
531##
532# ElementTree wrapper class. This class represents an entire element
533# hierarchy, and adds some extra support for serialization to and from
534# standard XML.
535#
536# @param element Optional root element.
537# @keyparam file Optional file handle or name. If given, the
538# tree is initialized with the contents of this XML file.
539
class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            # loading a file replaces whatever root was passed in
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.  If a
    # file name is given, the file is opened here and closed again
    # before this method returns, whether or not parsing succeeded.
    # A file object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        # only files opened by this method are closed by this method
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in chunks, until read() returns nothing
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path), except that a leading "/" in the
    # path is turned into "./" first.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path), except that a leading "/"
    # in the path is turned into "./" first.
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path), except that a leading "/" in the
    # path is turned into "./" first.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.  An XML declaration
    # is written only for encodings other than "utf-8" and "us-ascii".
    # If a file name is given, the file is opened here and closed again
    # before this method returns, so that the output is complete on
    # disk.  A file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        # only files opened by this method are closed by this method
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # Recursively write node, its subtree and its tail to file.
        # namespaces maps namespace URIs to the prefixes that are in
        # scope; it is updated while descending and restored on the
        # way back up.
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                # the tag is neither a QName nor sliceable like a string
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        # values are only rewritten when explicitly
                        # wrapped in a QName
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # declarations for namespaces first used on this element
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                # no text and no children: use the empty-element form
                file.write(" />")
            # the prefixes declared on this element go out of scope
            # here (v is the namespace URI of each declaration)
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
715
716# --------------------------------------------------------------------
717# helpers
718
719##
720# Checks if an object appears to be a valid element object.
721#
# @param element An element instance.
723# @return A true value if this is an element object.
724# @defreturn flag
725
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    #
    # Instances of the reference implementation are accepted right
    # away; any other object qualifies if it has a "tag" attribute.
    is_reference_type = isinstance(element, _ElementInterface)
    if is_reference_type:
        return is_reference_type
    return hasattr(element, "tag")
730
731##
732# Writes an element tree or element structure to sys.stdout. This
733# function should be used for debugging only.
734# <p>
735# The exact output format is implementation dependent. In this
736# version, it's written as an ordinary XML file.
737#
738# @param elem An element tree or an individual element.
739
def dump(elem):
    # debugging
    # a bare element is wrapped in a tree, so that write() can be used
    if isinstance(elem, ElementTree):
        tree = elem
    else:
        tree = ElementTree(elem)
    tree.write(sys.stdout)
    # finish with a newline, unless the root's tail already ends in one
    tail = tree.getroot().tail
    if tail and tail[-1] == "\n":
        return
    sys.stdout.write("\n")
747 sys.stdout.write("\n")
748
749def _encode(s, encoding):
750 try:
751 return s.encode(encoding)
752 except AttributeError:
753 return s # 1.5.2: assume the string uses the right encoding
754
755if sys.version[:3] == "1.5":
756 _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
757else:
758 _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
759
760_escape_map = {
761 "&": "&amp;",
762 "<": "&lt;",
763 ">": "&gt;",
764 '"': "&quot;",
765}
766
767_namespace_map = {
768 # "well-known" namespace prefixes
769 "http://www.w3.org/XML/1998/namespace": "xml",
770 "http://www.w3.org/1999/xhtml": "html",
771 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
772 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
773}
774
775def _raise_serialization_error(text):
776 raise TypeError(
777 "cannot serialize %r (type %s)" % (text, type(text).__name__)
778 )
779
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def replace_run(match, table=_escape_map):
        # characters listed in the table get their named entity, all
        # others a decimal character reference
        pieces = []
        for char in match.group():
            entity = table.get(char)
            if entity is None:
                entity = "&#%d;" % ord(char)
            pieces.append(entity)
        return string.join(pieces, "")
    try:
        escaped = pattern.sub(replace_run, text)
        return _encode(escaped, "ascii")
    except TypeError:
        # text is not something the pattern can be applied to
        _raise_serialization_error(text)
795
796#
797# the following functions assume an ascii-compatible encoding
798# (or "utf-16")
799
def _escape_cdata(text, encoding=None, replace=string.replace):
    # escape character data
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding: fall back
                # on entities, which handles the reserved characters too
                return _encode_entity(text)
        # "&" has to go first, so that the entities produced by the
        # later replacements are not escaped a second time
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = replace(text, raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
813 _raise_serialization_error(text)
814
def _escape_attrib(text, encoding=None, replace=string.replace):
    # escape attribute value
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                # not representable in the target encoding: fall back
                # on entities
                return _encode_entity(text)
        # "&" has to go first, so that the entities produced by the
        # later replacements are not escaped a second time
        replacements = (
            ("&", "&amp;"),
            ("'", "&apos;"), # FIXME: overkill
            ("\"", "&quot;"),
            ("<", "&lt;"),
            (">", "&gt;"),
        )
        for raw, entity in replacements:
            text = replace(text, raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
830 _raise_serialization_error(text)
831
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    # drop the leading "{" and split at the first "}"
    namespace_uri, local = string.split(tag[1:], "}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # prefix already in scope, no declaration needed
        return "%s:%s" % (prefix, local), None
    # not in scope yet: use a well-known prefix if there is one,
    # otherwise invent an "nsN" prefix, and register it
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    xmlns = None
    if prefix != "xml":
        # the "xml" prefix is never declared
        xmlns = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local), xmlns
851
852##
853# Parses an XML document into an element tree.
854#
855# @param source A filename or file object containing XML data.
856# @param parser An optional parser instance. If not given, the
857# standard {@link XMLTreeBuilder} parser is used.
858# @return An ElementTree instance
859
def parse(source, parser=None):
    # Start from an empty tree and let it load the document; the tree
    # itself is returned, not the root element.
    document = ElementTree()
    document.parse(source, parser)
    return document
864
865##
866# Parses an XML document into an element tree incrementally, and reports
867# what's going on to the user.
868#
869# @param source A filename or file object containing XML data.
870# @param events A list of events to report back. If omitted, only "end"
871# events are reported.
872# @return A (event, elem) iterator.
873
class iterparse:
    # Incremental parser.  The source is read in chunks and fed to an
    # XMLTreeBuilder, whose expat callbacks are replaced by handlers
    # that queue (event, value) pairs; next() hands them out one by one.
    # Once the document has been parsed completely, the root element is
    # available in the root attribute.

    def __init__(self, source, events=None):
        # _close_file records whether the file was opened here (a file
        # name was given); only then is it closed again when parsing is
        # done.  File objects supplied by the caller are left open.
        self._close_file = 0
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = 1
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # the handlers get everything they need through default
        # arguments, so they work without nested scopes
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the plain start handler if any of
                    # the above is not available
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # report the URI in ascii-encoded form if possible
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        # Return the next queued (event, value) pair, reading and
        # parsing more data whenever the queue runs dry.
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # everything has been parsed and handed out
                    self.root = self._root
                    try:
                        raise StopIteration
                    except NameError:
                        # no StopIteration in this Python version
                        raise IndexError
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    # end of input; close() may queue further events
                    self._root = self._parser.close()
                    self._parser = None
                    if self._close_file:
                        # done with the file we opened ourselves.  If
                        # parsing fails before this point, the file is
                        # not closed here.
                        self._close_file = 0
                        self._file.close()
            else:
                self._index = self._index + 1
                return item

    # use the iterator protocol where available, and the sequence
    # protocol otherwise
    try:
        iter
        def __iter__(self):
            return self
    except NameError:
        def __getitem__(self, index):
            return self.next()
952
953##
954# Parses an XML document from a string constant. This function can
955# be used to embed "XML literals" in Python code.
956#
# @param text A string containing XML data.
958# @return An Element instance.
959# @defreturn Element
960
def XML(text):
    # Run the complete text through a fresh tree builder in one go;
    # closing the builder yields the root element.
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()
965
966##
967# Parses an XML document from a string constant, and also returns
968# a dictionary which maps from element id:s to elements.
969#
# @param text A string containing XML data.
971# @return A tuple containing an Element instance and a dictionary.
972# @defreturn (Element, dictionary)
973
def XMLID(text):
    # Parse the text, then index every element that has a non-empty
    # "id" attribute.  If an id occurs more than once, the element
    # that comes last in document order is the one that is kept.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    elements_by_id = {}
    for element in root.getiterator():
        element_id = element.get("id")
        if not element_id:
            continue
        elements_by_id[element_id] = element
    return root, elements_by_id
984
985##
986# Parses an XML document from a string constant. Same as {@link #XML}.
987#
988# @def fromstring(text)
# @param text A string containing XML data.
990# @return An Element instance.
991# @defreturn Element
992
993fromstring = XML
994
995##
996# Generates a string representation of an XML element, including all
997# subelements.
998#
999# @param element An Element instance.
1000# @return An encoded string containing the XML data.
1001# @defreturn string
1002
def tostring(element, encoding=None):
    # Serialize through ElementTree.write, using a minimal file-like
    # object whose write method simply collects the fragments in a
    # list; the fragments are joined at the end.
    class dummy:
        pass
    fragments = []
    sink = dummy()
    sink.write = fragments.append
    ElementTree(element).write(sink, encoding)
    return string.join(fragments, "")
1011
1012##
1013# Generic element structure builder. This builder converts a sequence
1014# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1015# #TreeBuilder.end} method calls to a well-formed element structure.
1016# <p>
1017# You can use this class to build an element structure using a custom XML
1018# parser, or a parser for some other XML-like format.
1019#
1020# @param element_factory Optional element factory. This factory
1021# is called to create new Element instances, as necessary.
1022
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector (text chunks not yet flushed)
        self._elem = [] # element stack (currently open elements)
        self._last = None # last element opened or closed
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Returns the toplevel document element, i.e. the element that was
    # closed last.  Every opened element must have been closed before
    # this method is called.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If elements are still open, or if no
    #     element has been seen at all.

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test, as in _flush: "!= None" would run whatever
        # comparison hooks a custom element factory's objects define
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Joins the collected text chunks and stores the result on the last
    # element: as its tail if an end tag was seen most recently,
    # otherwise as its text.  Text collected before any element exists
    # is dropped.  The collector is emptied in either case.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = string.join(self._data, "")
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.  The text is only collected
    # here; it is attached to an element by the next start or end call.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.  The element is created by the element
    # factory and appended to the innermost open element, if any.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0 # text that follows belongs to elem.text
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the tag of
    #     the innermost open element.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1 # text that follows belongs to the closed element's tail
        return self._last
1099
1100##
1101# Element structure builder for XML source data, based on the
1102# <b>expat</b> parser.
1103#
1104# @keyparam target Target object. If omitted, the builder uses an
1105# instance of the standard {@link #TreeBuilder} class.
1106# @keyparam html Predefine HTML entities. This flag is not supported
1107# by the current implementation.
1108# @see #ElementTree
1109# @see #TreeBuilder
1110
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is passed as the namespace separator; _fixname relies on
        # it to recognise names that need a leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside a doctype
        self.entity = {} # entity name -> replacement text, see _default

    # Converts a text string to ascii, if possible; text that cannot be
    # encoded is returned unchanged.

    def _fixtext(self, text):
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    # Expands a qname, and converts the name string to ascii, if
    # possible.  A name containing "}" gets a leading "{".  Results are
    # memoized per incoming name.

    def _fixname(self, key):
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    # Start-tag callback used when attributes arrive as a mapping.

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    # Start-tag callback used when attributes arrive as a flat sequence
    # of alternating names and values (installed together with
    # ordered_attributes in __init__).

    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    # Character data callback.

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    # End-tag callback.

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    # Default callback.  Handles two things: references to entities
    # (looked up in the entity dictionary), and the tokens of a doctype
    # declaration, which are collected until the name, the optional
    # public identifier and the system identifier are available and can
    # be handed to the doctype method.

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                # declaration ended before it could be reported
                self._doctype = None
                return
            text = string.strip(text)
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    # not enough tokens yet (or an unknown form)
                    return
                # strip the first and last character (the quotes) from
                # the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder must not be
    # used after this call, since its parser and target are released.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
Fredrik Lundhbf84e542006-07-06 12:29:24 +00001258
1259# compatibility
# XMLParser is bound to the very same class object as XMLTreeBuilder
1260XMLParser = XMLTreeBuilder