blob: 694d6c4a9091629d017232635dba75660cc82df7 [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
# Names exported by "from <module> import *".
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLParser", "XMLTreeBuilder",
    ]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000112import sys, re
Armin Rigo9ed73062005-12-14 18:10:45 +0000113
Alex Martelli6cefeb02006-08-21 23:45:19 +0000114from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000115
116# TODO: add support for custom namespace resolvers/default namespaces
117# TODO: add improved support for incremental parsing
118
# Version string of the ElementTree toolkit implemented by this module.
VERSION = "1.2.6"
120
121##
122# Internal element class. This class defines the Element interface,
123# and provides a reference implementation of this interface.
124# <p>
125# You should not create instances of this class directly. Use the
126# appropriate factory functions instead, such as {@link #Element}
127# and {@link #SubElement}.
128#
129# @see Element
130# @see SubElement
131# @see Comment
132# @see ProcessingInstruction
133
class _ElementInterface:
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None  # text after end tag, if any

    def __init__(self, tag, attrib):
        # NOTE: attrib is stored as given (not copied); the Element and
        # SubElement factories copy the caller's dictionary first.
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement.  A slice object is accepted as
    # well, in which case a list of subelements is returned.
    #
    # @param index What subelement (or slice of subelements) to return.
    # @return The given subelement, or a list for a slice.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.  When index is a slice object,
    # the slice is replaced with the elements of the given sequence
    # (this is how slice assignment arrives on Python 3, which never
    # calls __setslice__).
    #
    # @param index What subelement (or slice) to replace.
    # @param element The new element value, or for a slice, an
    #     iterable of elements.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        if isinstance(index, slice):
            # materialize first, so a one-shot iterable is not consumed
            # by the validation loop
            elements = list(element)
            for item in elements:
                assert iselement(item)
            self._children[index] = elements
        else:
            assert iselement(element)
            self._children[index] = element

    ##
    # Deletes the given subelement (or slice of subelements).
    #
    # @param index What subelement to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    # Only invoked by Python 2; Python 3 passes a slice object to
    # __getitem__ instead.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    # Only invoked by Python 2; Python 3 passes a slice object to
    # __setitem__ instead.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        # materialize first, so a one-shot iterable is not consumed by
        # the validation loop
        elements = list(elements)
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = elements

    ##
    # Deletes a number of subelements.  Only invoked by Python 2;
    # Python 3 passes a slice object to __delitem__ instead.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  Note that the internal child list itself is returned,
    # not a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets the attribute names, as returned by the attribute
    # dictionary's keys() method.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return The element attribute names.
    # @defreturn list of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as returned by the attribute
    # dictionary's items() method.  The attributes are returned in
    # an arbitrary order.
    #
    # @return (name, value) tuples for all attributes.
    # @defreturn list of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list or iterator containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        nodes = []
        if tag == "*":
            # "*" is a wildcard, equivalent to not giving a tag at all
            tag = None
        if tag is None or self.tag == tag:
            nodes.append(self)
        for node in self._children:
            nodes.extend(node.getiterator(tag))
        return nodes

# compatibility
_Element = _ElementInterface
406
407##
408# Element factory. This function returns an object implementing the
409# standard Element interface. The exact class or type of that object
410# is implementation dependent, but it will always be compatible with
411# the {@link #_ElementInterface} class in this module.
412# <p>
413# The element name, attribute names, and attribute values can be
414# either 8-bit ASCII strings or Unicode strings.
415#
416# @param tag The element name.
417# @param attrib An optional dictionary, containing element attributes.
418# @param **extra Additional attributes, given as keyword arguments.
419# @return An element instance.
420# @defreturn Element
421
def Element(tag, attrib={}, **extra):
    """Create an element with the given tag and attributes.

    The attrib dictionary is copied before the keyword arguments are
    merged in, so neither the caller's dictionary nor the shared
    default is ever modified.
    """
    attrib = attrib.copy()
    attrib.update(extra)
    return _ElementInterface(tag, attrib)
426
427##
428# Subelement factory. This function creates an element instance, and
429# appends it to an existing element.
430# <p>
431# The element name, attribute names, and attribute values can be
432# either 8-bit ASCII strings or Unicode strings.
433#
434# @param parent The parent element.
435# @param tag The subelement name.
436# @param attrib An optional dictionary, containing element attributes.
437# @param **extra Additional attributes, given as keyword arguments.
438# @return An element instance.
439# @defreturn Element
440
def SubElement(parent, tag, attrib={}, **extra):
    """Create an element via parent.makeelement and append it to parent.

    The attrib dictionary is copied before the keyword arguments are
    merged in, so the caller's dictionary is left untouched.  Returns
    the newly created element.
    """
    attrib = attrib.copy()
    attrib.update(extra)
    element = parent.makeelement(tag, attrib)
    parent.append(element)
    return element
447
448##
449# Comment element factory. This factory function creates a special
450# element that will be serialized as an XML comment.
451# <p>
452# The comment string can be either an 8-bit ASCII string or a Unicode
453# string.
454#
455# @param text A string containing the comment string.
456# @return An element instance, representing a comment.
457# @defreturn Element
458
def Comment(text=None):
    """Create a comment element holding the given text.

    The element's tag is the Comment function itself; the serializer
    recognizes comment elements by that identity.
    """
    element = Element(Comment)
    element.text = text
    return element
463
464##
465# PI element factory. This factory function creates a special element
466# that will be serialized as an XML processing instruction.
467#
468# @param target A string containing the PI target.
469# @param text A string containing the PI contents, if any.
470# @return An element instance, representing a PI.
471# @defreturn Element
472
def ProcessingInstruction(target, text=None):
    """Create a processing instruction element.

    The element's tag is the ProcessingInstruction function itself.
    Its text is the target, followed by a space and the contents when
    non-empty contents are given.
    """
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

# short alias
PI = ProcessingInstruction
481
482##
483# QName wrapper. This can be used to wrap a QName attribute value, in
484# order to get proper namespace handling on output.
485#
486# @param text A string containing the QName value, in the form {uri}local,
487# or, if the tag argument is given, the URI part of a QName.
488# @param tag Optional tag. If given, the first argument is interpreted as
489# an URI, and this argument is interpreted as a local name.
490# @return An opaque object, representing the QName.
491
class QName:
    """Wrapper for a qualified name of the form {uri}local.

    Instances hash and compare by their text, both against other
    QName instances and against plain strings.
    """

    def __init__(self, text_or_uri, tag=None):
        # with a tag, the first argument is the namespace URI
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    # Rich comparisons: Python 3 ignores __cmp__, and without __eq__
    # two QNames with the same text would compare unequal even though
    # they hash alike.

    def __eq__(self, other):
        if isinstance(other, QName):
            return self.text == other.text
        return self.text == other

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        if isinstance(other, QName):
            return self.text < other.text
        return self.text < other

    def __le__(self, other):
        if isinstance(other, QName):
            return self.text <= other.text
        return self.text <= other

    def __gt__(self, other):
        if isinstance(other, QName):
            return self.text > other.text
        return self.text > other

    def __ge__(self, other):
        if isinstance(other, QName):
            return self.text >= other.text
        return self.text >= other

    def __cmp__(self, other):
        # kept for callers that invoke it directly; written without
        # the cmp() builtin, which does not exist on Python 3
        if isinstance(other, QName):
            other = other.text
        return (self.text > other) - (self.text < other)
505
506##
507# ElementTree wrapper class. This class represents an entire element
508# hierarchy, and adds some extra support for serialization to and from
509# standard XML.
510#
511# @param element Optional root element.
512# @keyparam file Optional file handle or name. If given, the
513# tree is initialized with the contents of this XML file.
514
class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element  # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.  If a
    # file name is given, the file is opened here and closed again
    # before returning; a file object passed in by the caller is
    # left open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            opened_here = True
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the document to the parser in 32k chunks
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what we opened ourselves
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        # an absolute path is taken relative to the root element
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.  If a file name is
    # given, the file is opened here and closed again before
    # returning; a file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = False
        if not hasattr(file, "write"):
            file = open(file, "wb")
            opened_here = True
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only emitted for encodings other
                # than utf-8 and us-ascii
                file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            self._write(file, self._root, encoding, {})
        finally:
            # only close what we opened ourselves
            if opened_here:
                file.close()

    # NOTE(review): _write mixes str literals with the return values of
    # _encode/_escape_cdata/_escape_attrib.  Where _encode returns bytes
    # (str.encode on Python 3) these concatenations cannot work -- the
    # str/bytes contract of the serializer needs to be confirmed.

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        tag = node.tag
        if tag is Comment:
            file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
        elif tag is ProcessingInstruction:
            file.write("<?%s?>" % _escape_cdata(node.text, encoding))
        else:
            items = list(node.items())
            xmlns_items = []  # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write("<" + _encode(tag, encoding))
            if items or xmlns_items:
                items.sort()  # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(" %s=\"%s\"" % (_encode(k, encoding),
                                               _escape_attrib(v, encoding)))
            if node.text or len(node):
                file.write(">")
                if node.text:
                    file.write(_escape_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write("</" + _encode(tag, encoding) + ">")
            else:
                file.write(" />")
            # namespaces declared on this element go out of scope here
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_escape_cdata(node.tail, encoding))
690
691# --------------------------------------------------------------------
692# helpers
693
694##
695# Checks if an object appears to be a valid element object.
696#
697# @param element An element instance.
698# @return A true value if this is an element object.
699# @defreturn flag
700
def iselement(element):
    """Return a true value if the object looks like an element.

    An object qualifies if it is an _ElementInterface instance or
    simply has a "tag" attribute.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, _ElementInterface) or hasattr(element, "tag")
705
706##
707# Writes an element tree or element structure to sys.stdout. This
708# function should be used for debugging only.
709# <p>
710# The exact output format is implementation dependent. In this
711# version, it's written as an ordinary XML file.
712#
713# @param elem An element tree or an individual element.
714
def dump(elem):
    """Write an element tree or element to sys.stdout, for debugging.

    A bare element is wrapped in an ElementTree first.  A newline is
    written afterwards unless the root element's tail already ends
    with one.
    """
    # debugging
    if not isinstance(elem, ElementTree):
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
723
def _encode(s, encoding):
    """Return s.encode(encoding), or s itself if it has no encode method.

    Encoding errors (UnicodeError) are not caught here; callers rely
    on them to fall back to character references.
    """
    try:
        return s.encode(encoding)
    except AttributeError:
        return s  # 1.5.2: assume the string uses the right encoding
729
# Matches runs of characters that _encode_entity must replace: the
# reserved characters & < > " and everything from U+0080 to U+FFFF.
# (The former Python 1.5 branch and the eval() of a u"..." literal are
# gone: this module already requires relative-import syntax, and the
# plain literal below yields the same pattern text.)
_escape = re.compile("[&<>\"\u0080-\uffff]+")

# reserved characters and the named entities they are written as
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
749
def _raise_serialization_error(text):
    """Raise TypeError reporting that the given object cannot be serialized."""
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )
754
def _encode_entity(text, pattern=_escape):
    """Replace reserved and non-ascii characters with entities.

    Every character matched by pattern is written as its named entity
    from _escape_map, or as a decimal character reference if it has
    none.  The result is then passed through _encode with "ascii".
    Raises TypeError (via _raise_serialization_error) if the
    substitution rejects text.
    """
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, table=_escape_map):
        pieces = []
        for char in m.group():
            replacement = table.get(char)
            if replacement is None:
                replacement = "&#%d;" % ord(char)
            pieces.append(replacement)
        return "".join(pieces)
    try:
        return _encode(pattern.sub(escape_entities, text), "ascii")
    except TypeError:
        _raise_serialization_error(text)
770
771#
772# the following functions assume an ascii-compatible encoding
773# (or "utf-16")
774
def _escape_cdata(text, encoding=None):
    """Escape &, < and > in character data.

    If an encoding is given, text is first passed through _encode; if
    that raises UnicodeError, the result of _encode_entity(text) is
    returned instead.  Raises TypeError (via
    _raise_serialization_error) if text cannot be processed.
    """
    # escape character data
    # NOTE(review): where _encode returns bytes (str.encode on Python 3)
    # the str-based replace() calls below raise TypeError, which is then
    # reported as a serialization error -- confirm the intended
    # str/bytes contract.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must be replaced first, so the entities added below are
        # not escaped a second time
        text = text.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
789
def _escape_attrib(text, encoding=None):
    """Escape &, ', ", < and > in an attribute value.

    If an encoding is given, text is first passed through _encode; if
    that raises UnicodeError, the result of _encode_entity(text) is
    returned instead.  Raises TypeError (via
    _raise_serialization_error) if text cannot be processed.
    """
    # escape attribute value
    # NOTE(review): where _encode returns bytes (str.encode on Python 3)
    # the str-based replace() calls below raise TypeError, which is then
    # reported as a serialization error -- confirm the intended
    # str/bytes contract.
    try:
        if encoding:
            try:
                text = _encode(text, encoding)
            except UnicodeError:
                return _encode_entity(text)
        # "&" must be replaced first, so the entities added below are
        # not escaped a second time
        text = text.replace("&", "&amp;")
        text = text.replace("'", "&apos;")  # FIXME: overkill
        text = text.replace("\"", "&quot;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
806
def fixtag(tag, namespaces):
    """Turn a decorated tag ({uri}local) into a prefixed one.

    namespaces maps namespace URIs to the prefixes currently in scope
    and is updated in place when a new prefix is assigned.  Returns
    (prefixed_tag, xmlns), where xmlns is an (attribute name, uri)
    pair for a newly needed declaration, or None if the prefix is
    already in scope or is the predefined "xml" prefix.
    """
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = tag[1:].split("}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # not in scope yet: use a well-known prefix, or generate one
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        if prefix == "xml":
            xmlns = None
        else:
            xmlns = ("xmlns:%s" % prefix, namespace_uri)
    else:
        xmlns = None
    return "%s:%s" % (prefix, tag), xmlns
826
827##
828# Parses an XML document into an element tree.
829#
830# @param source A filename or file object containing XML data.
831# @param parser An optional parser instance. If not given, the
832# standard {@link XMLTreeBuilder} parser is used.
833# @return An ElementTree instance
834
def parse(source, parser=None):
    """Parse the given file name or file object into a new ElementTree."""
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
839
840##
841# Parses an XML document into an element tree incrementally, and reports
842# what's going on to the user.
843#
844# @param source A filename or file object containing XML data.
845# @param events A list of events to report back. If omitted, only "end"
846# events are reported.
847# @return A (event, elem) iterator.
848
class iterparse:
    """Incremental parser yielding (event, elem) pairs.

    The source is read in 16k chunks as the iterator is advanced.
    Once the input is exhausted, the root element is available as the
    "root" attribute.  A file opened here from a file name is closed
    when the end of the input is reached; a file object passed in by
    the caller is left open.
    """

    def __init__(self, source, events=None):
        self._close_file = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = True  # we opened it, so we close it
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            # each handler binds event/append as default arguments, so
            # it keeps the values of this loop iteration
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the tree builder's _start handler
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    def __next__(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # input exhausted and all buffered events delivered
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    self._root = self._parser.close()
                    self._parser = None
                    if self._close_file:
                        self._file.close()
            else:
                self._index = self._index + 1
                return item

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +0000927
928##
929# Parses an XML document from a string constant. This function can
930# be used to embed "XML literals" in Python code.
931#
932# @param source A string containing XML data.
933# @return An Element instance.
934# @defreturn Element
935
def XML(text):
    """Parse XML data from a string and return the root element."""
    parser = XMLTreeBuilder()
    parser.feed(text)
    return parser.close()
940
941##
942# Parses an XML document from a string constant, and also returns
943# a dictionary which maps from element id:s to elements.
944#
945# @param source A string containing XML data.
946# @return A tuple containing an Element instance and a dictionary.
947# @defreturn (Element, dictionary)
948
def XMLID(text):
    """Parse XML data from a string; return (root, ids).

    ids maps each non-empty "id" attribute value found in the tree to
    its element.  If the same id occurs more than once, the last
    element in document order wins.
    """
    parser = XMLTreeBuilder()
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.getiterator():
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
959
960##
961# Parses an XML document from a string constant. Same as {@link #XML}.
962#
963# @def fromstring(text)
964# @param source A string containing XML data.
965# @return An Element instance.
966# @defreturn Element
967
fromstring = XML
969
##
# Generates a string representation of an XML element, including all
# subelements.
#
# @param element An Element instance.
# @param encoding Optional output encoding; handed on unchanged to
#     the write method of the temporary ElementTree wrapper.
# @return An encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None):
    # bare-bones file-like object: its only attribute is a write
    # method, which collects every fragment passed to it
    class _Sink:
        pass
    fragments = []
    sink = _Sink()
    sink.write = fragments.append
    ElementTree(element).write(sink, encoding)
    return "".join(fragments)
Armin Rigo9ed73062005-12-14 18:10:45 +0000986
##
# Generic element structure builder. This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory. This factory
#     is called with a tag and an attribute dictionary to create new
#     Element instances, as necessary. The created objects must
#     provide an append method and tag, text and tail attributes.

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Checks that the document is complete, and returns the toplevel
    # document element. Text passed to {@link #TreeBuilder.data}
    # after the last end tag is not attached to any element here.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If some element is still open, or if
    #     no element was ever started.

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # identity test: "!= None" would be routed through the
        # element's own comparison methods, which a custom element
        # factory is free to overload
        assert self._last is not None, "missing toplevel element"
        return self._last

    ##
    # (Internal) Attaches the collected character data to the most
    # recent element: as its tail if that element has already been
    # closed, otherwise as its text. Data collected before the first
    # element is started is discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element. The text is only buffered
    # here; it is attached to an element by the next start or end
    # call.
    #
    # @param data A string. This should be either an 8-bit string
    #     containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element. The new element is appended to the
    # currently open element, if there is one.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0 # text that follows belongs to elem.text
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag does not match the element
    #     that is currently open.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1 # text that follows belongs to the tail
        return self._last
1074
##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
#
# @keyparam target Target object. If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class. The target
#     must provide start, data, end and close methods.
# @keyparam html Predefine HTML entities. This flag is not supported
#     by the current implementation; the argument is accepted but
#     never used.
# @see #ElementTree
# @see #TreeBuilder

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is passed as the namespace separator, so qualified names
        # arrive as "uri}local"; _fixname adds the leading "{"
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        # maps entity names to replacement text; consulted by _default
        # for entity references that reach the default handler
        self.entity = {}

    ##
    # (Internal) Converts a text string to ascii, if possible;
    # otherwise the string is returned unchanged.

    def _fixtext(self, text):
        try:
            return _encode(text, "ascii")
        except UnicodeError:
            return text

    ##
    # (Internal) Expands a qualified name as reported by expat to the
    # "{uri}local" form, and converts it to ascii, if possible. The
    # result is cached per input name.

    def _fixname(self, key):
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    ##
    # (Internal) Start tag handler for parsers that report attributes
    # as a dictionary.

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = self._fixtext(value)
        return self._target.start(tag, attrib)

    ##
    # (Internal) Start tag handler for parsers that report attributes
    # as a flat list of alternating names and values.

    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            # names sit at even positions, values at odd positions
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
        return self._target.start(tag, attrib)

    ##
    # (Internal) Character data handler.

    def _data(self, text):
        return self._target.data(self._fixtext(text))

    ##
    # (Internal) End tag handler.

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    ##
    # (Internal) Default handler. Resolves entity references through
    # the entity dictionary, and collects the pieces of a doctype
    # declaration until they can be passed to the doctype method.
    #
    # @exception expat.error If an entity reference has no entry in
    #     the entity dictionary.

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities; text is "&name;", so the
            # slice strips the ampersand and the semicolon
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                # end of declaration reached before it was recognized
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, _, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, _, system = self._doctype
                    pubid = None
                else:
                    # not complete yet, or not a recognized form
                    return
                # drop the quotes surrounding the identifiers
                if pubid:
                    pubid = pubid[1:-1]
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration. The default implementation does
    # nothing; subclasses can override it.
    #
    # @param name Doctype name.
    # @param pubid Public identifier, or None for a SYSTEM doctype.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser. The references to the
    # parser and the target are dropped afterwards, so the builder
    # cannot be used again.
    #
    # @return An element structure (whatever the target's close
    #     method returns).
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001233
# compatibility: XMLParser is another name for XMLTreeBuilder
XMLParser = XMLTreeBuilder