blob: 2fba1771b4e4647165f6200a24a66cb751fe849d [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
# Names exported by "from <module> import *".  XMLID is a public helper
# defined in this module and is therefore listed alongside XML.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML", "XMLID",
    "XMLParser", "XMLTreeBuilder",
    ]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000112import sys, re
Armin Rigo9ed73062005-12-14 18:10:45 +0000113
Alex Martelli6cefeb02006-08-21 23:45:19 +0000114from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000115
116# TODO: add support for custom namespace resolvers/default namespaces
117# TODO: add improved support for incremental parsing
118
119VERSION = "1.2.6"
120
121##
122# Internal element class. This class defines the Element interface,
123# and provides a reference implementation of this interface.
124# <p>
125# You should not create instances of this class directly. Use the
126# appropriate factory functions instead, such as {@link #Element}
127# and {@link #SubElement}.
128#
129# @see Element
130# @see SubElement
131# @see Comment
132# @see ProcessingInstruction
133
class _ElementInterface:
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    ##
    # Initializes the element.
    #
    # @param tag Element tag.
    # @param attrib Element attribute dictionary.  It is stored as is
    #     (not copied).

    def __init__(self, tag, attrib):
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement, or a list of subelements if a
    # slice object is given.
    #
    # @param index What subelement (or slice of subelements) to return.
    # @return The given subelement, or a list of subelements.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement, or a slice of subelements.
    #
    # @param index What subelement (or slice of subelements) to replace.
    # @param element The new element value.  For a slice, a sequence
    #     with zero or more elements.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element (or, for a slice, any member
    #     of the sequence) is not a valid object.

    def __setitem__(self, index, element):
        if isinstance(index, slice):
            # Slice assignment arrives here (not in __setslice__) when
            # the interpreter passes slice objects; validate every
            # member before touching the child list.
            element = list(element)
            for item in element:
                assert iselement(item)
        else:
            assert iselement(element)
        self._children[index] = element

    ##
    # Deletes the given subelement, or a slice of subelements.
    #
    # @param index What subelement (or slice of subelements) to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        for element in elements:
            assert iselement(element)
        self._children[start:stop] = list(elements)

    ##
    # Deletes a number of subelements.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement from the list of children.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  The returned object is the internal child list itself,
    # not a copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    # Delegates to ElementPath.find.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    # Delegates to ElementPath.findtext.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    # Delegates to ElementPath.findall.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets the attribute names, as returned by the keys() method of the
    # attribute dictionary.  The names are returned in an arbitrary
    # order (just like for an ordinary Python dictionary).
    #
    # @return The element attribute names.

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as returned by the items() method of the
    # attribute dictionary.  The attributes are returned in an
    # arbitrary order.
    #
    # @return The (name, value) pairs for all attributes.

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all
    #     elements; "*" is treated the same way).
    # @return A list containing all the matching elements.
    # @defreturn list

    def getiterator(self, tag=None):
        nodes = []
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            nodes.append(self)
        for node in self._children:
            # depth-first, so the result is in document order
            nodes.extend(node.getiterator(tag))
        return nodes

# compatibility
_Element = _ElementInterface
406
407##
408# Element factory. This function returns an object implementing the
409# standard Element interface. The exact class or type of that object
410# is implementation dependent, but it will always be compatible with
411# the {@link #_ElementInterface} class in this module.
412# <p>
413# The element name, attribute names, and attribute values can be
414# either 8-bit ASCII strings or Unicode strings.
415#
416# @param tag The element name.
417# @param attrib An optional dictionary, containing element attributes.
418# @param **extra Additional attributes, given as keyword arguments.
419# @return An element instance.
420# @defreturn Element
421
def Element(tag, attrib={}, **extra):
    # Work on a copy so that neither the caller's dictionary nor the
    # shared default argument is ever modified.
    merged = attrib.copy()
    merged.update(extra)
    return _ElementInterface(tag, merged)
426
427##
428# Subelement factory. This function creates an element instance, and
429# appends it to an existing element.
430# <p>
431# The element name, attribute names, and attribute values can be
432# either 8-bit ASCII strings or Unicode strings.
433#
434# @param parent The parent element.
435# @param tag The subelement name.
436# @param attrib An optional dictionary, containing element attributes.
437# @param **extra Additional attributes, given as keyword arguments.
438# @return An element instance.
439# @defreturn Element
440
def SubElement(parent, tag, attrib={}, **extra):
    # Merge the keyword attributes into a private copy of the mapping.
    merged = attrib.copy()
    merged.update(extra)
    # Let the parent decide what kind of element to create, then attach it.
    child = parent.makeelement(tag, merged)
    parent.append(child)
    return child
447
448##
449# Comment element factory. This factory function creates a special
450# element that will be serialized as an XML comment.
451# <p>
452# The comment string can be either an 8-bit ASCII string or a Unicode
453# string.
454#
455# @param text A string containing the comment string.
456# @return An element instance, representing a comment.
457# @defreturn Element
458
def Comment(text=None):
    # The Comment function object itself serves as the tag marker.
    node = Element(Comment)
    node.text = text
    return node
463
464##
465# PI element factory. This factory function creates a special element
466# that will be serialized as an XML processing instruction.
467#
468# @param target A string containing the PI target.
469# @param text A string containing the PI contents, if any.
470# @return An element instance, representing a PI.
471# @defreturn Element
472
def ProcessingInstruction(target, text=None):
    # The ProcessingInstruction function object itself serves as the tag
    # marker; target and contents are stored together in the text attribute.
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

PI = ProcessingInstruction
481
482##
483# QName wrapper. This can be used to wrap a QName attribute value, in
484# order to get proper namespace handling on output.
485#
486# @param text A string containing the QName value, in the form {uri}local,
487# or, if the tag argument is given, the URI part of a QName.
488# @param tag Optional tag. If given, the first argument is interpreted as
489# an URI, and this argument is interpreted as a local name.
490# @return An opaque object, representing the QName.
491
class QName:
    # Wraps a qualified name.  Comparison and hashing are based on the
    # "{uri}local" text, so a QName compares equal to (and hashes like)
    # the plain string with the same text.

    def __init__(self, text_or_uri, tag=None):
        # With a tag, the first argument is the namespace URI.
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def _other_text(self, other):
        # Returns the value to compare self.text against.
        if isinstance(other, QName):
            return other.text
        return other

    # Rich comparisons: required where __cmp__ is not consulted, and
    # must agree with __hash__ so QNames work as dictionary keys.

    def __eq__(self, other):
        return self.text == self._other_text(other)

    def __ne__(self, other):
        return self.text != self._other_text(other)

    def __lt__(self, other):
        return self.text < self._other_text(other)

    def __le__(self, other):
        return self.text <= self._other_text(other)

    def __gt__(self, other):
        return self.text > self._other_text(other)

    def __ge__(self, other):
        return self.text >= self._other_text(other)

    def __cmp__(self, other):
        # Kept for callers that invoke it explicitly; written without
        # the cmp() builtin so it does not depend on it being available.
        other = self._other_text(other)
        return (self.text > other) - (self.text < other)
505
506##
507# ElementTree wrapper class. This class represents an entire element
508# hierarchy, and adds some extra support for serialization to and from
509# standard XML.
510#
511# @param element Optional root element.
512# @keyparam file Optional file handle or name. If given, the
513# tree is initialized with the contents of this XML file.
514
class ElementTree:

    ##
    # Creates the tree wrapper.
    #
    # @param element Optional root element.
    # @keyparam file Optional file handle or name.  If given, the
    #     tree is initialized with the contents of this XML file.
    # @exception AssertionError If element is given but is not a valid
    #     element object.

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.
    #
    # @param source A file name or file object.  A file opened here
    #     from a file name is closed again before returning; a file
    #     object passed in by the caller is left open.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if not parser:
                parser = XMLTreeBuilder()
            # feed the parser in chunks to bound memory use
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what we opened ourselves
            if opened_here:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return Whatever the root element's getiterator method returns.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        # an absolute path is evaluated relative to the root element
        if path[:1] == "/":
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.
    #
    # @param file A file name, or a file object opened for writing.
    #     A file opened here from a file name is closed again before
    #     returning; a file object passed in by the caller is left open.
    # @param encoding Optional output encoding (default is US-ASCII).

    def write(self, file, encoding="us-ascii"):
        assert self._root is not None
        opened_here = not hasattr(file, "write")
        if opened_here:
            file = open(file, "wb")
        try:
            if not encoding:
                encoding = "us-ascii"
            elif encoding != "utf-8" and encoding != "us-ascii":
                # an XML declaration is only written for other encodings
                file.write(_encode("<?xml version='1.0' encoding='%s'?>\n" % encoding, encoding))
            self._write(file, self._root, encoding, {})
        finally:
            # only close what we opened ourselves
            if opened_here:
                file.close()

    ##
    # Serializes a single node (and, recursively, its subelements).
    #
    # @param file A file object with a write method.
    # @param node The element to serialize.
    # @param encoding The output encoding.
    # @param namespaces Dictionary mapping namespace URIs to the
    #     prefixes currently in scope; updated while the node is being
    #     written and restored afterwards.

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        tag = node.tag
        if tag is Comment:
            file.write(_encode("<!-- %s -->" % _escape_cdata(node.text), encoding))
        elif tag is ProcessingInstruction:
            file.write(_encode("<?%s?>" % _escape_cdata(node.text), encoding))
        else:
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns: xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write(_encode("<" + tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns: xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
                # namespace declarations go after the ordinary attributes
                for k, v in xmlns_items:
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
            if node.text or len(node):
                file.write(_encode(">", encoding))
                if node.text:
                    file.write(_encode(_escape_cdata(node.text), encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write(_encode("</" + tag + ">", encoding))
            else:
                # no text and no children: use the empty-element form
                file.write(_encode(" />", encoding))
            # prefixes declared on this element go out of scope again
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_encode(_escape_cdata(node.tail), encoding))
Armin Rigo9ed73062005-12-14 18:10:45 +0000688
689# --------------------------------------------------------------------
690# helpers
691
692##
693# Checks if an object appears to be a valid element object.
694#
# @param element An element instance.
696# @return A true value if this is an element object.
697# @defreturn flag
698
def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, _ElementInterface):
        return True
    # duck typing: anything exposing a "tag" attribute is accepted
    return hasattr(element, "tag")
703
704##
705# Writes an element tree or element structure to sys.stdout. This
706# function should be used for debugging only.
707# <p>
708# The exact output format is implementation dependent. In this
709# version, it's written as an ordinary XML file.
710#
711# @param elem An element tree or an individual element.
712
def dump(elem):
    # debugging
    # Accept either a tree or a bare element; wrap the latter.
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    # NOTE(review): write() passes the output of _encode to the stream;
    # confirm sys.stdout accepts that on the target Python version.
    tree.write(sys.stdout)
    tail = tree.getroot().tail
    # make sure the output ends with a newline
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
721
def _encode(s, encoding):
    # Encode s with the given encoding; objects without an encode
    # method are passed through untouched.
    try:
        encoded = s.encode(encoding)
    except AttributeError:
        encoded = s # 1.5.2: assume the string uses the right encoding
    return encoded
727
# Pattern matching runs of characters that need escaping: the markup
# characters & < > " and everything from U+0080 to U+FFFF.  It is the
# default pattern of _encode_entity.
# NOTE(review): the literal is a raw string, so the \uXXXX escapes are
# left for the re module to interpret -- confirm that the targeted
# Python versions support this.
_escape = re.compile(r"[&<>\"\u0080-\uffff]+")

# Named entities for the markup characters; used by _encode_entity,
# which falls back to numeric character references for anything else.
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# Maps namespace URIs to preferred prefixes; consulted by fixtag before
# it generates an "ns<N>" prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
744
def _raise_serialization_error(text):
    # Always raises: reports both the offending value and its type name.
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)
749
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(m, map=_escape_map):
        # Translate one matched run, character by character.
        def entity_for(char):
            known = map.get(char)
            if known is None:
                return "&#%d;" % ord(char)
            return known
        return "".join([entity_for(char) for char in m.group()])
    try:
        replaced = pattern.sub(escape_entities, text)
        return _encode(replaced, "ascii")
    except TypeError:
        _raise_serialization_error(text)
765
766#
767# the following functions assume an ascii-compatible encoding
768# (or "utf-16")
769
def _escape_cdata(text):
    # escape character data
    try:
        # "&" must be handled first so that the entities produced by the
        # later replacements are not escaped a second time
        for raw, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")):
            text = text.replace(raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
779
def _escape_attrib(text):
    # escape attribute value
    replacements = (
        # "&" first, so entities produced below are not escaped again
        ("&", "&amp;"),
        ("'", "&apos;"), # FIXME: overkill
        ("\"", "&quot;"),
        ("<", "&lt;"),
        (">", "&gt;"),
    )
    try:
        for raw, entity in replacements:
            text = text.replace(raw, entity)
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
791
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, tag = tag[1:].split("}", 1)
    declaration = None
    prefix = namespaces.get(namespace_uri)
    if prefix is None:
        # not in scope yet: prefer a well-known prefix, otherwise
        # generate one from the number of prefixes currently in scope
        prefix = _namespace_map.get(namespace_uri)
        if prefix is None:
            prefix = "ns%d" % len(namespaces)
        namespaces[namespace_uri] = prefix
        # no declaration is produced for the "xml" prefix
        if prefix != "xml":
            declaration = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, tag), declaration
811
812##
813# Parses an XML document into an element tree.
814#
815# @param source A filename or file object containing XML data.
816# @param parser An optional parser instance. If not given, the
817# standard {@link XMLTreeBuilder} parser is used.
818# @return An ElementTree instance
819
def parse(source, parser=None):
    # Build an empty tree wrapper and let it load the document.
    document = ElementTree()
    document.parse(source, parser)
    return document
824
825##
826# Parses an XML document into an element tree incrementally, and reports
827# what's going on to the user.
828#
829# @param source A filename or file object containing XML data.
830# @param events A list of events to report back. If omitted, only "end"
831# events are reported.
832# @return A (event, elem) iterator.
833
class iterparse:

    ##
    # Sets up incremental parsing.
    #
    # @param source A filename or file object containing XML data.  A
    #     file opened here from a file name is closed once it has been
    #     read to the end; a file object passed in by the caller is
    #     left open.
    # @param events A list of events to report back ("start", "end",
    #     "start-ns", "end-ns").  If omitted, only "end" events are
    #     reported.

    def __init__(self, source, events=None):
        # remember whether the file is ours to close
        self._close_file = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = True
        self._file = source
        self._events = []
        self._index = 0
        # root is only published once iteration has finished
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        # Each handler binds its collaborators as default arguments, so
        # every handler keeps the event name it was created for.
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back to the tree builder's _start callback
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        # keep the URI as is if it is not pure ASCII
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    ##
    # Returns the next (event, elem) pair, reading and parsing more of
    # the file when the event buffer has been used up.
    #
    # @return An (event, elem) tuple.
    # @exception StopIteration When the document has been fully parsed
    #     and all events have been returned.

    def __next__(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # parsing is complete: publish the root and stop
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    # end of input: release a file we opened ourselves
                    # before finishing the parse
                    if self._close_file:
                        self._file.close()
                    self._root = self._parser.close()
                    self._parser = None
            else:
                self._index = self._index + 1
                return item

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +0000912
913##
914# Parses an XML document from a string constant. This function can
915# be used to embed "XML literals" in Python code.
916#
# @param text A string containing XML data.
918# @return An Element instance.
919# @defreturn Element
920
def XML(text):
    # Feed the whole literal to a fresh builder in one go and return
    # whatever the builder produces on close.
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
925
926##
927# Parses an XML document from a string constant, and also returns
928# a dictionary which maps from element id:s to elements.
929#
# @param text A string containing XML data.
931# @return A tuple containing an Element instance and a dictionary.
932# @defreturn (Element, dictionary)
933
def XMLID(text):
    """Parse *text* and also index its elements by their "id" attribute.

    Returns a ``(tree, ids)`` tuple: *tree* is the result of parsing the
    string with XMLTreeBuilder, and *ids* maps each non-empty "id"
    attribute value to the element carrying it.  When several elements
    share an id, the one visited last by ``getiterator()`` wins.
    """
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    # pair every element with its "id" value; missing/empty ids are dropped
    candidates = ((node.get("id"), node) for node in root.getiterator())
    by_id = {key: node for key, node in candidates if key}
    return root, by_id
944
945##
946# Parses an XML document from a string constant. Same as {@link #XML}.
947#
948# @def fromstring(text)
# @param text A string containing XML data.
950# @return An Element instance.
951# @defreturn Element
952
# Plain alias: fromstring is bound to the very same function object as XML.
953fromstring = XML
954
955##
956# Generates a string representation of an XML element, including all
957# subelements.
958#
959# @param element An Element instance.
960# @return An encoded string containing the XML data.
961# @defreturn string
962
def tostring(element, encoding=None):
    """Serialize *element* through ElementTree.write and return the bytes.

    The element is wrapped in an ElementTree and written to a minimal
    in-memory sink that only offers ``write``; every chunk written is
    collected in a list and the chunks are joined with ``b""``.
    *encoding* is passed through to ElementTree.write unchanged.
    """
    chunks = []

    class _Sink:
        # the only file-like feature provided: write(chunk) stores the chunk
        write = staticmethod(chunks.append)

    ElementTree(element).write(_Sink(), encoding)
    return b"".join(chunks)
Armin Rigo9ed73062005-12-14 18:10:45 +0000971
972##
973# Generic element structure builder. This builder converts a sequence
974# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
975# #TreeBuilder.end} method calls to a well-formed element structure.
976# <p>
977# You can use this class to build an element structure using a custom XML
978# parser, or a parser for some other XML-like format.
979#
980# @param element_factory Optional element factory. This factory
981# is called to create new Element instances, as necessary.
982
class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        # called as factory(tag, attrs) for every opened element
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        # NOTE: these are plain assert statements, so they are not checked
        # when Python runs with -O.
        assert len(self._elem) == 0, "missing end tags"
        # identity test on purpose: "!=" would call the element's own
        # comparison methods, which a custom factory may have overridden
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Attaches the collected character data to the last element: as its
    # tail if an end tag was seen last, otherwise as its text.  Data
    # collected before any element exists is discarded.
    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string. This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        # only buffered here; joined and attached by the next _flush()
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # nest under the innermost element that is still open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        # text arriving from now on belongs to this element's tail
        self._tail = 1
        return self._last
1059
1060##
1061# Element structure builder for XML source data, based on the
1062# <b>expat</b> parser.
1063#
1064# @keyparam target Target object. If omitted, the builder uses an
1065# instance of the standard {@link #TreeBuilder} class.
1066# @keyparam html Predefine HTML entities. This flag is not supported
1067# by the current implementation.
1068# @see #ElementTree
1069# @see #TreeBuilder
1070
class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        # NOTE: ``html`` is accepted but never used anywhere in this class.
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        # "}" is passed as the namespace separator; names containing it
        # are later given a leading "{" by _fixname
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside <!DOCTYPE
        self.entity = {} # entity name -> replacement text, see _default

    def _fixname(self, key):
        # expand qname ("uri}local" -> "{uri}local"); results are memoized
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    # start handler for parsers that deliver attributes as a mapping
    def _start(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self._target.start(tag, attrib)

    # start handler used with ordered_attributes: attrib_in is a flat
    # sequence [name0, value0, name1, value1, ...]
    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = attrib_in[i+1]
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(text)

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    # default handler: resolves entity references through self.entity and
    # collects the tokens of a doctype declaration
    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self._target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
                # carry the same attributes as errors raised by expat itself
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self._parser.ErrorLineNumber
                err.offset = self._parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if pubid:
                    pubid = pubid[1:-1] # drop the surrounding quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001209
1210# compatibility
# XMLParser is a second name bound to the XMLTreeBuilder class itself.
1211XMLParser = XMLTreeBuilder