blob: deaed7d27be1609790b660436ce3fbcc5a137397 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "iselement",
    "iterparse",
    "parse",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser",
    "XMLTreeBuilder",
    ]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
112import string, sys, re
113
class _SimpleElementPath:
    # Stand-in used when the ElementPath module cannot be imported.
    # Emulates the pre-1.2 find/findtext/findall behaviour: plain tag
    # comparison against the direct children of the given element, plus
    # a ".//tag" shortcut in findall.

    def find(self, element, tag):
        # first direct child whose tag matches, or None
        for child in element:
            if child.tag != tag:
                continue
            return child
        return None

    def findtext(self, element, tag, default=None):
        # text of the first matching child ("" if it has no text),
        # or the default if nothing matches
        for child in element:
            if child.tag != tag:
                continue
            return child.text or ""
        return default

    def findall(self, element, tag):
        # ".//tag" searches the whole subtree via getiterator
        if tag[:3] == ".//":
            return element.getiterator(tag[3:])
        return [child for child in element if child.tag == tag]

try:
    from . import ElementPath
except ImportError:
    # FIXME: issue warning in this case?
    # TODO: DEFINITELY issue warning here!!!
    ElementPath = _SimpleElementPath()

# TODO: add support for custom namespace resolvers/default namespaces
# TODO: add improved support for incremental parsing

VERSION = "1.2.6"
146
147##
148# Internal element class. This class defines the Element interface,
149# and provides a reference implementation of this interface.
150# <p>
151# You should not create instances of this class directly. Use the
152# appropriate factory functions instead, such as {@link #Element}
153# and {@link #SubElement}.
154#
155# @see Element
156# @see SubElement
157# @see Comment
158# @see ProcessingInstruction
159
160class _ElementInterface:
161 # <tag attrib>text<child/>...</tag>tail
162
163 ##
164 # (Attribute) Element tag.
165
166 tag = None
167
168 ##
169 # (Attribute) Element attribute dictionary. Where possible, use
170 # {@link #_ElementInterface.get},
171 # {@link #_ElementInterface.set},
172 # {@link #_ElementInterface.keys}, and
173 # {@link #_ElementInterface.items} to access
174 # element attributes.
175
176 attrib = None
177
178 ##
179 # (Attribute) Text before first subelement. This is either a
180 # string or the value None, if there was no text.
181
182 text = None
183
184 ##
185 # (Attribute) Text after this element's end tag, but before the
186 # next sibling element's start tag. This is either a string or
187 # the value None, if there was no text.
188
189 tail = None # text after end tag, if any
190
191 def __init__(self, tag, attrib):
192 self.tag = tag
193 self.attrib = attrib
194 self._children = []
195
196 def __repr__(self):
197 return "<Element %s at %x>" % (self.tag, id(self))
198
199 ##
200 # Creates a new element object of the same type as this element.
201 #
202 # @param tag Element tag.
203 # @param attrib Element attributes, given as a dictionary.
204 # @return A new element instance.
205
206 def makeelement(self, tag, attrib):
207 return Element(tag, attrib)
208
209 ##
210 # Returns the number of subelements.
211 #
212 # @return The number of subelements.
213
214 def __len__(self):
215 return len(self._children)
216
217 ##
218 # Returns the given subelement.
219 #
220 # @param index What subelement to return.
221 # @return The given subelement.
222 # @exception IndexError If the given element does not exist.
223
224 def __getitem__(self, index):
225 return self._children[index]
226
227 ##
228 # Replaces the given subelement.
229 #
230 # @param index What subelement to replace.
231 # @param element The new element value.
232 # @exception IndexError If the given element does not exist.
233 # @exception AssertionError If element is not a valid object.
234
235 def __setitem__(self, index, element):
236 assert iselement(element)
237 self._children[index] = element
238
239 ##
240 # Deletes the given subelement.
241 #
242 # @param index What subelement to delete.
243 # @exception IndexError If the given element does not exist.
244
245 def __delitem__(self, index):
246 del self._children[index]
247
248 ##
249 # Returns a list containing subelements in the given range.
250 #
251 # @param start The first subelement to return.
252 # @param stop The first subelement that shouldn't be returned.
253 # @return A sequence object containing subelements.
254
255 def __getslice__(self, start, stop):
256 return self._children[start:stop]
257
258 ##
259 # Replaces a number of subelements with elements from a sequence.
260 #
261 # @param start The first subelement to replace.
262 # @param stop The first subelement that shouldn't be replaced.
263 # @param elements A sequence object with zero or more elements.
264 # @exception AssertionError If a sequence member is not a valid object.
265
266 def __setslice__(self, start, stop, elements):
267 for element in elements:
268 assert iselement(element)
269 self._children[start:stop] = list(elements)
270
271 ##
272 # Deletes a number of subelements.
273 #
274 # @param start The first subelement to delete.
275 # @param stop The first subelement to leave in there.
276
277 def __delslice__(self, start, stop):
278 del self._children[start:stop]
279
280 ##
281 # Adds a subelement to the end of this element.
282 #
283 # @param element The element to add.
284 # @exception AssertionError If a sequence member is not a valid object.
285
286 def append(self, element):
287 assert iselement(element)
288 self._children.append(element)
289
290 ##
291 # Inserts a subelement at the given position in this element.
292 #
293 # @param index Where to insert the new subelement.
294 # @exception AssertionError If the element is not a valid object.
295
296 def insert(self, index, element):
297 assert iselement(element)
298 self._children.insert(index, element)
299
300 ##
301 # Removes a matching subelement. Unlike the <b>find</b> methods,
302 # this method compares elements based on identity, not on tag
303 # value or contents.
304 #
305 # @param element What element to remove.
306 # @exception ValueError If a matching element could not be found.
307 # @exception AssertionError If the element is not a valid object.
308
309 def remove(self, element):
310 assert iselement(element)
311 self._children.remove(element)
312
313 ##
314 # Returns all subelements. The elements are returned in document
315 # order.
316 #
317 # @return A list of subelements.
318 # @defreturn list of Element instances
319
320 def getchildren(self):
321 return self._children
322
323 ##
324 # Finds the first matching subelement, by tag name or path.
325 #
326 # @param path What element to look for.
327 # @return The first matching element, or None if no element was found.
328 # @defreturn Element or None
329
330 def find(self, path):
331 return ElementPath.find(self, path)
332
333 ##
334 # Finds text for the first matching subelement, by tag name or path.
335 #
336 # @param path What element to look for.
337 # @param default What to return if the element was not found.
338 # @return The text content of the first matching element, or the
339 # default value no element was found. Note that if the element
340 # has is found, but has no text content, this method returns an
341 # empty string.
342 # @defreturn string
343
344 def findtext(self, path, default=None):
345 return ElementPath.findtext(self, path, default)
346
347 ##
348 # Finds all matching subelements, by tag name or path.
349 #
350 # @param path What element to look for.
351 # @return A list or iterator containing all matching elements,
352 # in document order.
353 # @defreturn list of Element instances
354
355 def findall(self, path):
356 return ElementPath.findall(self, path)
357
358 ##
359 # Resets an element. This function removes all subelements, clears
360 # all attributes, and sets the text and tail attributes to None.
361
362 def clear(self):
363 self.attrib.clear()
364 self._children = []
365 self.text = self.tail = None
366
367 ##
368 # Gets an element attribute.
369 #
370 # @param key What attribute to look for.
371 # @param default What to return if the attribute was not found.
372 # @return The attribute value, or the default value, if the
373 # attribute was not found.
374 # @defreturn string or None
375
376 def get(self, key, default=None):
377 return self.attrib.get(key, default)
378
379 ##
380 # Sets an element attribute.
381 #
382 # @param key What attribute to set.
383 # @param value The attribute value.
384
385 def set(self, key, value):
386 self.attrib[key] = value
387
388 ##
389 # Gets a list of attribute names. The names are returned in an
390 # arbitrary order (just like for an ordinary Python dictionary).
391 #
392 # @return A list of element attribute names.
393 # @defreturn list of strings
394
395 def keys(self):
396 return self.attrib.keys()
397
398 ##
399 # Gets element attributes, as a sequence. The attributes are
400 # returned in an arbitrary order.
401 #
402 # @return A list of (name, value) tuples for all attributes.
403 # @defreturn list of (string, string) tuples
404
405 def items(self):
406 return self.attrib.items()
407
408 ##
409 # Creates a tree iterator. The iterator loops over this element
410 # and all subelements, in document order, and returns all elements
411 # with a matching tag.
412 # <p>
413 # If the tree structure is modified during iteration, the result
414 # is undefined.
415 #
416 # @param tag What tags to look for (default is to return all elements).
417 # @return A list or iterator containing all the matching elements.
418 # @defreturn list or iterator
419
420 def getiterator(self, tag=None):
421 nodes = []
422 if tag == "*":
423 tag = None
424 if tag is None or self.tag == tag:
425 nodes.append(self)
426 for node in self._children:
427 nodes.extend(node.getiterator(tag))
428 return nodes
429
430# compatibility
431_Element = _ElementInterface
432
##
# Element factory.  This function returns an object implementing the
# standard Element interface.  The exact class or type of that object
# is implementation dependent, but it will always be compatible with
# the {@link #_ElementInterface} class in this module.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def Element(tag, attrib={}, **extra):
    # work on a copy, so neither the caller's dictionary nor the shared
    # default is ever modified
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
452
##
# Subelement factory.  This function creates an element instance, and
# appends it to an existing element.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def SubElement(parent, tag, attrib={}, **extra):
    # work on a copy, so neither the caller's dictionary nor the shared
    # default is ever modified
    merged = attrib.copy()
    merged.update(extra)
    # let the parent decide what kind of element to create
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
473
##
# Comment element factory.  This factory function creates a special
# element that will be serialized as an XML comment.
# <p>
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element

def Comment(text=None):
    # the factory function itself serves as the tag that marks a comment
    node = Element(Comment)
    node.text = text
    return node
489
##
# PI element factory.  This factory function creates a special element
# that will be serialized as an XML processing instruction.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element

def ProcessingInstruction(target, text=None):
    # the factory function itself serves as the tag that marks a PI
    node = Element(ProcessingInstruction)
    if text:
        # target and contents are stored together, separated by a space
        node.text = target + " " + text
    else:
        node.text = target
    return node

PI = ProcessingInstruction
507
508##
509# QName wrapper. This can be used to wrap a QName attribute value, in
510# order to get proper namespace handling on output.
511#
512# @param text A string containing the QName value, in the form {uri}local,
513# or, if the tag argument is given, the URI part of a QName.
514# @param tag Optional tag. If given, the first argument is interpreted as
515# an URI, and this argument is interpreted as a local name.
516# @return An opaque object, representing the QName.
517
518class QName:
519 def __init__(self, text_or_uri, tag=None):
520 if tag:
521 text_or_uri = "{%s}%s" % (text_or_uri, tag)
522 self.text = text_or_uri
523 def __str__(self):
524 return self.text
525 def __hash__(self):
526 return hash(self.text)
527 def __cmp__(self, other):
528 if isinstance(other, QName):
529 return cmp(self.text, other.text)
530 return cmp(self.text, other)
531
##
# ElementTree wrapper class.  This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or name.  If given, the
#     tree is initialized with the contents of this XML file.

class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    # <p>
    # If a file name is given, the file is opened in binary mode and
    # closed again before this method returns (also on errors).  A
    # file object passed in by the caller is left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in chunks, until the source is exhausted
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what we opened ourselves
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # a leading slash is rewritten to start at the root element
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    # <p>
    # If a file name is given, the file is opened in binary mode and
    # closed again before this method returns (also on errors).  A
    # file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for other encodings
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only close what we opened ourselves
            if opened_here:
                file.close()

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        #
        # namespaces maps namespace URIs to the prefixes that are in
        # scope; entries added for this node are removed again before
        # returning
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = node.items()
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                # lexical order; sorted() also accepts items() results
                # that are not lists
                items = sorted(items)
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # v is the namespace URI of each declaration made above
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
716
717# --------------------------------------------------------------------
718# helpers
719
##
# Checks if an object appears to be a valid element object.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, _ElementInterface):
        return True
    # anything else counts as an element if it has a "tag" attribute
    return hasattr(element, "tag")
731
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    tree = elem
    if not isinstance(tree, ElementTree):
        # wrap a bare element, so that it can be written
        tree = ElementTree(tree)
    tree.write(sys.stdout)
    # make sure the output ends with a newline
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
749
750def _encode(s, encoding):
751 try:
752 return s.encode(encoding)
753 except AttributeError:
754 return s # 1.5.2: assume the string uses the right encoding
755
# Pattern matching runs of characters that _encode_entity rewrites: the
# reserved characters & < > " and everything above the ASCII range.
if sys.version[:3] == "1.5":
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    # NOTE(review): the pattern goes through eval(), presumably to keep
    # the u"..." literal away from the 1.5 compiler -- confirm before
    # simplifying.  The evaluated text is a constant, not external input.
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))

# Entity text for the reserved characters.  _encode_entity emits a
# numeric character reference for matched characters not listed here.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# Maps namespace URIs to preferred prefixes.  fixtag looks here before
# it falls back on a generated "ns<number>" prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
775
776def _raise_serialization_error(text):
777 raise TypeError(
778 "cannot serialize %r (type %s)" % (text, type(text).__name__)
779 )
780
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    #
    # text is the string to convert; pattern selects the character runs
    # that get rewritten.  The result is passed through _encode with the
    # "ascii" encoding.  A TypeError from the substitution is reported
    # as a serialization error for the text.
    def escape_entities(m, map=_escape_map):
        # rewrite one matched run, character by character
        out = []
        for char in m.group():
            entity = map.get(char)
            if entity is None:
                # no named entity; use a numeric character reference
                entity = "&#%d;" % ord(char)
            out.append(entity)
        return "".join(out)
    try:
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        _raise_serialization_error(text)
796
797#
798# the following functions assume an ascii-compatible encoding
799# (or "utf-16")
800
801def _escape_cdata(text, encoding=None, replace=string.replace):
802 # escape character data
803 try:
804 if encoding:
805 try:
806 text = _encode(text, encoding)
807 except UnicodeError:
808 return _encode_entity(text)
809 text = replace(text, "&", "&amp;")
810 text = replace(text, "<", "&lt;")
811 text = replace(text, ">", "&gt;")
812 return text
813 except (TypeError, AttributeError):
814 _raise_serialization_error(text)
815
816def _escape_attrib(text, encoding=None, replace=string.replace):
817 # escape attribute value
818 try:
819 if encoding:
820 try:
821 text = _encode(text, encoding)
822 except UnicodeError:
823 return _encode_entity(text)
824 text = replace(text, "&", "&amp;")
825 text = replace(text, "'", "&apos;") # FIXME: overkill
826 text = replace(text, "\"", "&quot;")
827 text = replace(text, "<", "&lt;")
828 text = replace(text, ">", "&gt;")
829 return text
830 except (TypeError, AttributeError):
831 _raise_serialization_error(text)
832
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    #
    # tag is a string or a QName.  namespaces maps namespace URIs to the
    # prefixes already in scope; a URI that is not in there yet is added.
    # Returns a ("prefix:tag", xmlns) tuple, where xmlns is either an
    # ("xmlns:prefix", uri) pair to declare, or None.
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = tag[1:].split("}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # first use in this scope; prefer a well-known prefix, and
        # generate one otherwise
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            # no declaration is emitted for the "xml" prefix
            xmlns = None
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        xmlns = None
    return "%s:%s" % (prefix, tag), xmlns
852
##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLTreeBuilder} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    # start from an empty tree, and let it load the document
    document = ElementTree()
    document.parse(source, parser)
    return document
865
##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.
# <p>
# If a file name is given, the file is opened in binary mode, and
# closed again once the whole document has been read.  A file object
# passed in by the caller is left open.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back.  If omitted, only "end"
#     events are reported.
# @return A (event, elem) iterator.

class iterparse:

    def __init__(self, source, events=None):
        # remember if the file was opened here, so it can be closed again
        self._close_file = 0
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = 1
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the builder's _start method
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def next(self):
        # return the next buffered (event, elem) item, reading more data
        # from the file whenever the buffer runs dry
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # all data has been parsed; publish the root element
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    self._root = self._parser.close()
                    self._parser = None
                    if self._close_file:
                        self._file.close()
            else:
                self._index = self._index + 1
                return item

    # alternate spelling of the same method, for the newer iterator protocol
    __next__ = next

    def __iter__(self):
        return self
953
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

def XML(text):
    # feed the whole string to a fresh builder, and return the result
    builder = XMLTreeBuilder()
    builder.feed(text)
    return builder.close()
966
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param text A string containing XML data.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # index every element that has a non-empty "id" attribute; if an id
    # occurs more than once, the last element in document order wins
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if key:
            by_id[key] = node
    return root, by_id

##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

fromstring = XML
995
996##
997# Generates a string representation of an XML element, including all
998# subelements.
999#
1000# @param element An Element instance.
1001# @return An encoded string containing the XML data.
1002# @defreturn string
1003
def tostring(element, encoding=None):
    # Serialize through ElementTree.write into an in-memory sink: a bare
    # object whose write attribute is the append method of a list, so
    # every chunk written is collected and concatenated at the end.
    class sink:
        pass
    chunks = []
    out = sink()
    out.write = chunks.append
    ElementTree(element).write(out, encoding)
    return "".join(chunks)
1012
1013##
1014# Generic element structure builder. This builder converts a sequence
1015# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
1016# #TreeBuilder.end} method calls to a well-formed element structure.
1017# <p>
1018# You can use this class to build an element structure using a custom XML
1019# parser, or a parser for some other XML-like format.
1020#
1021# @param element_factory Optional element factory. This factory
1022# is called to create new Element instances, as necessary.
1023
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        # called as factory(tag, attrs) for every element opened
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If elements are still open, or if no
    #     element was ever created.

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: never run the comparison operators of elements
        # produced by a custom element factory
        assert self._last is not None, "missing toplevel element"
        return self._last

    ##
    # (Internal) Attaches the collected character data to the most
    # recent element: as its tail if an end tag was seen last, as its
    # text otherwise.  Data collected before any element exists is
    # dropped.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.  The text is only collected
    # here; it is attached to an element by the next start or end call.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.  The new element is appended to the element
    # that is currently open, if any.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0 # following data is the text of this element
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     that is currently open.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1 # following data is the tail of this element
        return self._last
1100
1101##
1102# Element structure builder for XML source data, based on the
1103# <b>expat</b> parser.
1104#
1105# @keyparam target Target object. If omitted, the builder uses an
1106# instance of the standard {@link #TreeBuilder} class.
1107# @keyparam html Predefine HTML entities. This flag is not supported
1108# by the current implementation.
1109# @see #ElementTree
1110# @see #TreeBuilder
1111
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # html is accepted but not used by this implementation
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is the namespace separator; _fixname adds the leading "{"
        # to names that contain it
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text, set by caller

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for attributes delivered as a dictionary
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for attributes delivered as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    def _default(self, text):
        # Handles everything the other callbacks do not cover: entity
        # references unknown to the parser, and the pieces of a doctype
        # declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities; only the table lookup is
            # guarded, so a KeyError raised by the target is not
            # mistaken for a missing entity
            try:
                value = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                lineno = self._parser.ErrorLineNumber
                column = self._parser.ErrorColumnNumber
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, lineno, column)
                    )
                # carry the same attributes as errors raised by expat
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = lineno
                err.offset = column
                raise err
            self._target.data(value)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # the identifiers are reported without their quotes
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; override it in a subclass to see the declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The parser and the target
    # are released whether or not this succeeds, so the builder cannot
    # be used afterwards.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
            tree = self._target.close()
        finally:
            # get rid of circular references, also when parsing failed
            del self._target, self._parser
        return tree
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001259
# compatibility: XMLParser is a second name for the XMLTreeBuilder class
XMLParser = XMLTreeBuilder