blob: c47573e313637a38168a5c993963e2782db4b4be [file] [log] [blame]
Armin Rigo9ed73062005-12-14 18:10:45 +00001#
2# ElementTree
3# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
4#
5# light-weight XML support for Python 1.5.2 and later.
6#
7# history:
8# 2001-10-20 fl created (from various sources)
9# 2001-11-01 fl return root from parse method
10# 2002-02-16 fl sort attributes in lexical order
11# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
12# 2002-05-01 fl finished TreeBuilder refactoring
13# 2002-07-14 fl added basic namespace support to ElementTree.write
14# 2002-07-25 fl added QName attribute support
15# 2002-10-20 fl fixed encoding in write
16# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
17# 2002-11-27 fl accept file objects or file names for parse/write
18# 2002-12-04 fl moved XMLTreeBuilder back to this module
19# 2003-01-11 fl fixed entity encoding glitch for us-ascii
20# 2003-02-13 fl added XML literal factory
21# 2003-02-21 fl added ProcessingInstruction/PI factory
22# 2003-05-11 fl added tostring/fromstring helpers
23# 2003-05-26 fl added ElementPath support
24# 2003-07-05 fl added makeelement factory method
25# 2003-07-28 fl added more well-known namespace prefixes
26# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
27# 2003-09-04 fl fall back on emulator if ElementPath is not installed
28# 2003-10-31 fl markup updates
29# 2003-11-15 fl fixed nested namespace bug
30# 2004-03-28 fl added XMLID helper
31# 2004-06-02 fl added default support to findtext
32# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
33# 2004-08-23 fl take advantage of post-2.1 expat features
34# 2005-02-01 fl added iterparse implementation
35# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
36#
37# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
38#
39# fredrik@pythonware.com
40# http://www.pythonware.com
41#
42# --------------------------------------------------------------------
43# The ElementTree toolkit is
44#
45# Copyright (c) 1999-2005 by Fredrik Lundh
46#
47# By obtaining, using, and/or copying this software and/or its
48# associated documentation, you agree that you have read, understood,
49# and will comply with the following terms and conditions:
50#
51# Permission to use, copy, modify, and distribute this software and
52# its associated documentation for any purpose and without fee is
53# hereby granted, provided that the above copyright notice appears in
54# all copies, and that both that copyright notice and this permission
55# notice appear in supporting documentation, and that the name of
56# Secret Labs AB or the author not be used in advertising or publicity
57# pertaining to distribution of the software without specific, written
58# prior permission.
59#
60# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
61# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
62# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
63# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
64# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
65# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
66# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
67# OF THIS SOFTWARE.
68# --------------------------------------------------------------------
69
Fredrik Lundh63168a52005-12-14 22:29:34 +000070# Licensed to PSF under a Contributor Agreement.
71# See http://www.python.org/2.4/license for licensing details.
72
# Names exported by "from <module> import *".  Some of the listed names
# (for example "tostring", "TreeBuilder", "XMLParser" and "XMLTreeBuilder")
# are expected to be defined further down in this module.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring",
    "iselement", "iterparse",
    "parse",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION", "XML",
    "XMLParser", "XMLTreeBuilder",
    ]
89
90##
91# The <b>Element</b> type is a flexible container object, designed to
92# store hierarchical data structures in memory. The type can be
93# described as a cross between a list and a dictionary.
94# <p>
95# Each element has a number of properties associated with it:
96# <ul>
97# <li>a <i>tag</i>. This is a string identifying what kind of data
98# this element represents (the element type, in other words).</li>
99# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
100# <li>a <i>text</i> string.</li>
101# <li>an optional <i>tail</i> string.</li>
102# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
103# </ul>
104#
105# To create an element instance, use the {@link #Element} or {@link
106# #SubElement} factory functions.
107# <p>
108# The {@link #ElementTree} class can be used to wrap an element
109# structure, and convert it from and to XML.
110##
111
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000112import sys, re
Armin Rigo9ed73062005-12-14 18:10:45 +0000113
Alex Martelli6cefeb02006-08-21 23:45:19 +0000114from . import ElementPath
Armin Rigo9ed73062005-12-14 18:10:45 +0000115
116# TODO: add support for custom namespace resolvers/default namespaces
117# TODO: add improved support for incremental parsing
118
# Version string of the ElementTree toolkit this module is derived from.
VERSION = "1.2.6"
120
##
# Internal element class.  This class defines the Element interface,
# and provides a reference implementation of this interface.
# <p>
# You should not create instances of this class directly.  Use the
# appropriate factory functions instead, such as {@link #Element}
# and {@link #SubElement}.
#
# @see Element
# @see SubElement
# @see Comment
# @see ProcessingInstruction

class _ElementInterface:
    # <tag attrib>text<child/>...</tag>tail

    ##
    # (Attribute) Element tag.

    tag = None

    ##
    # (Attribute) Element attribute dictionary.  Where possible, use
    # {@link #_ElementInterface.get},
    # {@link #_ElementInterface.set},
    # {@link #_ElementInterface.keys}, and
    # {@link #_ElementInterface.items} to access
    # element attributes.

    attrib = None

    ##
    # (Attribute) Text before first subelement.  This is either a
    # string or the value None, if there was no text.

    text = None

    ##
    # (Attribute) Text after this element's end tag, but before the
    # next sibling element's start tag.  This is either a string or
    # the value None, if there was no text.

    tail = None # text after end tag, if any

    ##
    # Initializes the element.  The attribute dictionary is stored as
    # given (it is not copied here), and the child list starts empty.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.

    def __init__(self, tag, attrib):
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at %x>" % (self.tag, id(self))

    ##
    # Creates a new element object of the same type as this element.
    #
    # @param tag Element tag.
    # @param attrib Element attributes, given as a dictionary.
    # @return A new element instance.

    def makeelement(self, tag, attrib):
        return Element(tag, attrib)

    ##
    # Returns the number of subelements.
    #
    # @return The number of subelements.

    def __len__(self):
        return len(self._children)

    ##
    # Returns the given subelement, or, if a slice object is given,
    # a list of the subelements in that range.
    #
    # @param index What subelement (or slice of subelements) to return.
    # @return The given subelement, or a list of subelements.
    # @exception IndexError If the given element does not exist.

    def __getitem__(self, index):
        return self._children[index]

    ##
    # Replaces the given subelement.  If a slice object is given, the
    # subelements in that range are replaced with the elements from the
    # given sequence.
    #
    # @param index What subelement (or slice of subelements) to replace.
    # @param element The new element value, or a sequence of elements
    #     when index is a slice.
    # @exception IndexError If the given element does not exist.
    # @exception AssertionError If element is not a valid object.

    def __setitem__(self, index, element):
        if isinstance(index, slice):
            # Python 3 routes "elem[a:b] = seq" here (__setslice__ is
            # never called), so validate every member of the sequence
            # instead of the sequence object itself.
            elements = list(element)
            for item in elements:
                assert iselement(item)
            self._children[index] = elements
        else:
            assert iselement(element)
            self._children[index] = element

    ##
    # Deletes the given subelement, or the subelements in the given
    # slice.
    #
    # @param index What subelement (or slice of subelements) to delete.
    # @exception IndexError If the given element does not exist.

    def __delitem__(self, index):
        del self._children[index]

    ##
    # Returns a list containing subelements in the given range.
    # Legacy slice hook; kept for explicit callers only.
    #
    # @param start The first subelement to return.
    # @param stop The first subelement that shouldn't be returned.
    # @return A sequence object containing subelements.

    def __getslice__(self, start, stop):
        return self._children[start:stop]

    ##
    # Replaces a number of subelements with elements from a sequence.
    # Legacy slice hook; kept for explicit callers only.
    #
    # @param start The first subelement to replace.
    # @param stop The first subelement that shouldn't be replaced.
    # @param elements A sequence object with zero or more elements.
    # @exception AssertionError If a sequence member is not a valid object.

    def __setslice__(self, start, stop, elements):
        self.__setitem__(slice(start, stop), elements)

    ##
    # Deletes a number of subelements.
    # Legacy slice hook; kept for explicit callers only.
    #
    # @param start The first subelement to delete.
    # @param stop The first subelement to leave in there.

    def __delslice__(self, start, stop):
        del self._children[start:stop]

    ##
    # Adds a subelement to the end of this element.
    #
    # @param element The element to add.
    # @exception AssertionError If the element is not a valid object.

    def append(self, element):
        assert iselement(element)
        self._children.append(element)

    ##
    # Inserts a subelement at the given position in this element.
    #
    # @param index Where to insert the new subelement.
    # @param element The element to insert.
    # @exception AssertionError If the element is not a valid object.

    def insert(self, index, element):
        assert iselement(element)
        self._children.insert(index, element)

    ##
    # Removes a matching subelement.  Unlike the <b>find</b> methods,
    # this method compares elements based on identity, not on tag
    # value or contents.
    #
    # @param element What element to remove.
    # @exception ValueError If a matching element could not be found.
    # @exception AssertionError If the element is not a valid object.

    def remove(self, element):
        assert iselement(element)
        self._children.remove(element)

    ##
    # Returns all subelements.  The elements are returned in document
    # order.  Note that this is the internal child list itself, not a
    # copy.
    #
    # @return A list of subelements.
    # @defreturn list of Element instances

    def getchildren(self):
        return self._children

    ##
    # Finds the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        return ElementPath.find(self, path)

    ##
    # Finds text for the first matching subelement, by tag name or path.
    #
    # @param path What element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        return ElementPath.findtext(self, path, default)

    ##
    # Finds all matching subelements, by tag name or path.
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        return ElementPath.findall(self, path)

    ##
    # Resets an element.  This function removes all subelements, clears
    # all attributes, and sets the text and tail attributes to None.

    def clear(self):
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    ##
    # Gets an element attribute.
    #
    # @param key What attribute to look for.
    # @param default What to return if the attribute was not found.
    # @return The attribute value, or the default value, if the
    #     attribute was not found.
    # @defreturn string or None

    def get(self, key, default=None):
        return self.attrib.get(key, default)

    ##
    # Sets an element attribute.
    #
    # @param key What attribute to set.
    # @param value The attribute value.

    def set(self, key, value):
        self.attrib[key] = value

    ##
    # Gets the attribute names, as returned by the attribute
    # dictionary's keys() method.  The names are returned in an
    # arbitrary order (just like for an ordinary Python dictionary).
    #
    # @return The element attribute names.
    # @defreturn sequence of strings

    def keys(self):
        return self.attrib.keys()

    ##
    # Gets element attributes, as returned by the attribute
    # dictionary's items() method.  The attributes are returned in an
    # arbitrary order.
    #
    # @return The (name, value) pairs for all attributes.
    # @defreturn sequence of (string, string) tuples

    def items(self):
        return self.attrib.items()

    ##
    # Creates a tree iterator.  The iterator loops over this element
    # and all subelements, in document order, and returns all elements
    # with a matching tag.
    # <p>
    # If the tree structure is modified during iteration, the result
    # is undefined.
    #
    # @param tag What tags to look for (default is to return all elements).
    # @return A list containing all the matching elements.
    # @defreturn list or iterator

    def getiterator(self, tag=None):
        nodes = []
        if tag == "*":
            # "*" is a wildcard, equivalent to not filtering at all
            tag = None
        if tag is None or self.tag == tag:
            nodes.append(self)
        for node in self._children:
            nodes.extend(node.getiterator(tag))
        return nodes
403
# compatibility: older code refers to the element class as _Element
_Element = _ElementInterface
406
##
# Element factory.  Returns an object that implements the standard
# Element interface.  The concrete class or type of the result is an
# implementation detail, but it is always compatible with the
# {@link #_ElementInterface} class in this module.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param tag The element name.
# @param attrib An optional dictionary, containing element attributes.
#     The dictionary is copied; the caller's object is left untouched.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def Element(tag, attrib={}, **extra):
    # work on a private copy so neither the caller's dictionary nor
    # the shared default is ever modified
    attributes = attrib.copy()
    attributes.update(extra)
    return _ElementInterface(tag, attributes)
426
##
# Subelement factory.  Creates a new element through the parent's
# makeelement method and appends it to the parent.
# <p>
# The element name, attribute names, and attribute values can be
# either 8-bit ASCII strings or Unicode strings.
#
# @param parent The parent element.
# @param tag The subelement name.
# @param attrib An optional dictionary, containing element attributes.
#     The dictionary is copied; the caller's object is left untouched.
# @param **extra Additional attributes, given as keyword arguments.
# @return An element instance.
# @defreturn Element

def SubElement(parent, tag, attrib={}, **extra):
    # merge positional and keyword attributes into a private copy
    attributes = attrib.copy()
    attributes.update(extra)
    child = parent.makeelement(tag, attributes)
    parent.append(child)
    return child
447
##
# Comment element factory.  Creates a special element, tagged with
# this factory function itself, that is serialized as an XML comment.
# <p>
# The comment string can be either an 8-bit ASCII string or a Unicode
# string.
#
# @param text A string containing the comment string.
# @return An element instance, representing a comment.
# @defreturn Element

def Comment(text=None):
    # the factory function doubles as the marker tag for comments
    comment = Element(Comment)
    comment.text = text
    return comment
463
##
# PI element factory.  Creates a special element, tagged with this
# factory function itself, that is serialized as an XML processing
# instruction.  The element text is the target, followed by a space
# and the contents when contents are given.
#
# @param target A string containing the PI target.
# @param text A string containing the PI contents, if any.
# @return An element instance, representing a PI.
# @defreturn Element

def ProcessingInstruction(target, text=None):
    # the factory function doubles as the marker tag for PIs
    instruction = Element(ProcessingInstruction)
    if text:
        instruction.text = target + " " + text
    else:
        instruction.text = target
    return instruction

# short alias for the factory above
PI = ProcessingInstruction
481
##
# QName wrapper.  Wraps a QName attribute value so that it gets proper
# namespace handling on output.  Instances hash like their text and
# compare by text, both against other QName instances and against
# plain values.
#
# @param text A string containing the QName value, in the form {uri}local,
#     or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag.  If given, the first argument is interpreted as
#     a URI, and this argument is interpreted as a local name.
# @return An opaque object, representing the QName.

class QName:
    def __init__(self, text_or_uri, tag=None):
        # with a (truthy) tag, combine URI and local name as {uri}local
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps the right-hand side when it is a QName
    # and otherwise compares the text against the value as given.

    def __le__(self, other):
        return self.text <= (other.text if isinstance(other, QName) else other)

    def __lt__(self, other):
        return self.text < (other.text if isinstance(other, QName) else other)

    def __ge__(self, other):
        return self.text >= (other.text if isinstance(other, QName) else other)

    def __gt__(self, other):
        return self.text > (other.text if isinstance(other, QName) else other)

    def __eq__(self, other):
        return self.text == (other.text if isinstance(other, QName) else other)

    def __ne__(self, other):
        return self.text != (other.text if isinstance(other, QName) else other)
Armin Rigo9ed73062005-12-14 18:10:45 +0000525
##
# ElementTree wrapper class.  This class represents an entire element
# hierarchy, and adds some extra support for serialization to and from
# standard XML.
#
# @param element Optional root element.
# @keyparam file Optional file handle or name.  If given, the
#     tree is initialized with the contents of this XML file.

class ElementTree:

    def __init__(self, element=None, file=None):
        assert element is None or iselement(element)
        self._root = element # first node
        if file:
            self.parse(file)

    ##
    # Gets the root element for this tree.
    #
    # @return An element instance.
    # @defreturn Element

    def getroot(self):
        return self._root

    ##
    # Replaces the root element for this tree.  This discards the
    # current contents of the tree, and replaces it with the given
    # element.  Use with care.
    #
    # @param element An element instance.

    def _setroot(self, element):
        assert iselement(element)
        self._root = element

    ##
    # Loads an external XML document into this element tree.  If a
    # file name is given, the file is opened here and closed again
    # before returning; a file object passed in by the caller is left
    # open.
    #
    # @param source A file name or file object.
    # @param parser An optional parser instance.  If not given, the
    #     standard {@link XMLTreeBuilder} parser is used.
    # @return The document root element.
    # @defreturn Element

    def parse(self, source, parser=None):
        close_source = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            close_source = True
        try:
            if not parser:
                parser = XMLTreeBuilder()
            while 1:
                data = source.read(32768)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            # only close what we opened ourselves
            if close_source:
                source.close()

    ##
    # Creates a tree iterator for the root element.  The iterator loops
    # over all elements in this tree, in document order.
    #
    # @param tag What tags to look for (default is to return all elements)
    # @return An iterator.
    # @defreturn iterator

    def getiterator(self, tag=None):
        assert self._root is not None
        return self._root.getiterator(tag)

    ##
    # Finds the first toplevel element with given tag.
    # Same as getroot().find(path).
    #
    # @param path What element to look for.
    # @return The first matching element, or None if no element was found.
    # @defreturn Element or None

    def find(self, path):
        assert self._root is not None
        if path[:1] == "/":
            # make an absolute path relative to the root element
            path = "." + path
        return self._root.find(path)

    ##
    # Finds the element text for the first toplevel element with given
    # tag.  Same as getroot().findtext(path).
    #
    # @param path What toplevel element to look for.
    # @param default What to return if the element was not found.
    # @return The text content of the first matching element, or the
    #     default value if no element was found.  Note that if the
    #     element is found, but has no text content, this method
    #     returns an empty string.
    # @defreturn string

    def findtext(self, path, default=None):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findtext(path, default)

    ##
    # Finds all toplevel elements with the given tag.
    # Same as getroot().findall(path).
    #
    # @param path What element to look for.
    # @return A list or iterator containing all matching elements,
    #    in document order.
    # @defreturn list of Element instances

    def findall(self, path):
        assert self._root is not None
        if path[:1] == "/":
            path = "." + path
        return self._root.findall(path)

    ##
    # Writes the element tree to a file, as XML.  If a file name is
    # given, the file is opened here (in binary mode when an encoding
    # is given, in text mode otherwise) and closed again before
    # returning; a file object passed in by the caller is left open.
    #
    # @param file A file name, or a file object opened for writing.
    # @param encoding Optional output encoding (default is None)

    def write(self, file, encoding=None):
        assert self._root is not None
        close_file = False
        if not hasattr(file, "write"):
            if encoding:
                file = open(file, "wb")
            else:
                file = open(file, "w")
            close_file = True
        try:
            if encoding and encoding != "utf-8":
                file.write(_encode("<?xml version='1.0' encoding='%s'?>\n" % encoding, encoding))
            self._write(file, self._root, encoding, {})
        finally:
            # only close what we opened ourselves
            if close_file:
                file.close()

    ##
    # Serializes a node and, recursively, its children to the file.
    #
    # @param file A file object with a write method.
    # @param node The element to serialize.
    # @param encoding Output encoding, or None to write strings as is.
    # @param namespaces Dictionary mapping namespace URIs to prefixes
    #     for the namespaces that are currently in scope.

    def _write(self, file, node, encoding, namespaces):
        # write XML to file
        tag = node.tag
        if tag is Comment:
            # the delimiters go through _encode so that they have the
            # same type (str or bytes) as the encoded character data
            file.write(
                _encode("<!-- ", encoding)
                + _encode_cdata(node.text, encoding)
                + _encode(" -->", encoding)
                )
        elif tag is ProcessingInstruction:
            file.write(
                _encode("<?", encoding)
                + _encode_cdata(node.text, encoding)
                + _encode("?>", encoding)
                )
        else:
            items = list(node.items())
            xmlns_items = [] # new namespaces in this scope
            try:
                if isinstance(tag, QName) or tag[:1] == "{":
                    tag, xmlns = fixtag(tag, namespaces)
                    if xmlns:
                        xmlns_items.append(xmlns)
            except TypeError:
                _raise_serialization_error(tag)
            file.write(_encode("<" + tag, encoding))
            if items or xmlns_items:
                items.sort() # lexical order
                for k, v in items:
                    try:
                        if isinstance(k, QName) or k[:1] == "{":
                            k, xmlns = fixtag(k, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(k)
                    try:
                        if isinstance(v, QName):
                            v, xmlns = fixtag(v, namespaces)
                            if xmlns:
                                xmlns_items.append(xmlns)
                    except TypeError:
                        _raise_serialization_error(v)
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
                for k, v in xmlns_items:
                    file.write(_encode(" %s=\"%s\"" % (k, _escape_attrib(v)), encoding))
            if node.text or len(node):
                file.write(_encode(">", encoding))
                if node.text:
                    file.write(_encode_cdata(node.text, encoding))
                for n in node:
                    self._write(file, n, encoding, namespaces)
                file.write(_encode("</" + tag + ">", encoding))
            else:
                file.write(_encode(" />", encoding))
            # namespaces declared on this element go out of scope here;
            # v is the namespace URI, which is the key in namespaces
            for k, v in xmlns_items:
                del namespaces[v]
        if node.tail:
            file.write(_encode_cdata(node.tail, encoding))
Armin Rigo9ed73062005-12-14 18:10:45 +0000709
710# --------------------------------------------------------------------
711# helpers
712
##
# Checks if an object appears to be a valid element object.  An object
# qualifies if it is an instance of the element class in this module,
# or if it simply has a "tag" attribute.
#
# @param element An element instance.
# @return A true value if this is an element object.
# @defreturn flag

def iselement(element):
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, _ElementInterface):
        return True
    return hasattr(element, "tag")
724
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file, followed by a newline
# unless the root element's tail already ends with one.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout)
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
742
743def _encode(s, encoding):
Guido van Rossum34d19282007-08-09 01:03:29 +0000744 if encoding:
Armin Rigo9ed73062005-12-14 18:10:45 +0000745 return s.encode(encoding)
Guido van Rossum34d19282007-08-09 01:03:29 +0000746 else:
747 return s
Armin Rigo9ed73062005-12-14 18:10:45 +0000748
# default pattern for _encode_entity: runs of reserved characters and
# of characters in the range \u0080-\uffff
_escape = re.compile(r"[&<>\"\u0080-\uffff]+")

# reserved characters and the named entities used in their place
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
}

# namespace URI -> prefix; consulted by fixtag before it makes up an
# "ns<N>" prefix
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
765
766def _raise_serialization_error(text):
767 raise TypeError(
768 "cannot serialize %r (type %s)" % (text, type(text).__name__)
769 )
770
def _encode_entity(text, pattern=_escape):
    # map reserved and non-ascii characters to numerical entities
    def escape_entities(match, table=_escape_map):
        # use the named entity when the table has one, and a decimal
        # character reference otherwise
        pieces = []
        for char in match.group():
            replacement = table.get(char)
            if replacement is None:
                replacement = "&#%d;" % ord(char)
            pieces.append(replacement)
        return "".join(pieces)
    try:
        escaped = pattern.sub(escape_entities, text)
        return _encode(escaped, "ascii")
    except TypeError:
        _raise_serialization_error(text)
786
787#
788# the following functions assume an ascii-compatible encoding
789# (or "utf-16")
790
Antoine Pitrou54319282010-02-09 16:53:09 +0000791def _encode_cdata(text, encoding):
Armin Rigo9ed73062005-12-14 18:10:45 +0000792 # escape character data
793 try:
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000794 text = text.replace("&", "&amp;")
795 text = text.replace("<", "&lt;")
796 text = text.replace(">", "&gt;")
Antoine Pitrou54319282010-02-09 16:53:09 +0000797 if encoding:
798 return text.encode(encoding, "xmlcharrefreplace")
799 else:
800 return text
Armin Rigo9ed73062005-12-14 18:10:45 +0000801 except (TypeError, AttributeError):
802 _raise_serialization_error(text)
803
Martin v. Löwisf30bb0e2007-07-28 11:40:46 +0000804def _escape_attrib(text):
Armin Rigo9ed73062005-12-14 18:10:45 +0000805 # escape attribute value
806 try:
Neal Norwitz9d72bb42007-04-17 08:48:32 +0000807 text = text.replace("&", "&amp;")
808 text = text.replace("'", "&apos;") # FIXME: overkill
809 text = text.replace("\"", "&quot;")
810 text = text.replace("<", "&lt;")
811 text = text.replace(">", "&gt;")
Armin Rigo9ed73062005-12-14 18:10:45 +0000812 return text
813 except (TypeError, AttributeError):
814 _raise_serialization_error(text)
815
def fixtag(tag, namespaces):
    # given a decorated tag (of the form {uri}tag), return prefixed
    # tag and namespace declaration, if any
    if isinstance(tag, QName):
        tag = tag.text
    namespace_uri, local_name = tag[1:].split("}", 1)
    prefix = namespaces.get(namespace_uri)
    if prefix is not None:
        # namespace already in scope; nothing new to declare
        return "%s:%s" % (prefix, local_name), None
    # new namespace: use the well-known prefix if there is one, and
    # make one up otherwise
    prefix = _namespace_map.get(namespace_uri)
    if prefix is None:
        prefix = "ns%d" % len(namespaces)
    namespaces[namespace_uri] = prefix
    if prefix == "xml":
        # the xml prefix is recorded but gets no declaration
        declaration = None
    else:
        declaration = ("xmlns:%s" % prefix, namespace_uri)
    return "%s:%s" % (prefix, local_name), declaration
835
##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLTreeBuilder} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    # start from an empty tree and let it load the document itself
    document = ElementTree()
    document.parse(source, parser)
    return document
848
##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.  If a file name is given, the file is
# opened here and closed again once the document has been parsed
# successfully; a file object passed in by the caller is left open.
# <p>
# Once the iterator is exhausted, the root element is available as
# the <b>root</b> attribute.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back.  If omitted, only "end"
#     events are reported.
# @return A (event, elem) iterator.

class iterparse:

    def __init__(self, source, events=None):
        self._close_file = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            self._close_file = True # we own this file object
        self._file = source
        self._events = []
        self._index = 0
        self.root = self._root = None
        self._parser = XMLTreeBuilder()
        # wire up the parser for event reporting
        parser = self._parser._parser
        append = self._events.append
        if events is None:
            events = ["end"]
        for event in events:
            if event == "start":
                try:
                    parser.ordered_attributes = 1
                    parser.specified_attributes = 1
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start_list):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
                except AttributeError:
                    # fall back on the tree builder's _start handler
                    def handler(tag, attrib_in, event=event, append=append,
                                start=self._parser._start):
                        append((event, start(tag, attrib_in)))
                    parser.StartElementHandler = handler
            elif event == "end":
                def handler(tag, event=event, append=append,
                            end=self._parser._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event == "start-ns":
                def handler(prefix, uri, event=event, append=append):
                    # NOTE(review): when the URI is pure ASCII this
                    # reports the encoded (bytes) form rather than
                    # the string -- confirm that this is intended
                    try:
                        uri = _encode(uri, "ascii")
                    except UnicodeError:
                        pass
                    append((event, (prefix or "", uri)))
                parser.StartNamespaceDeclHandler = handler
            elif event == "end-ns":
                def handler(prefix, event=event, append=append):
                    append((event, None))
                parser.EndNamespaceDeclHandler = handler

    ##
    # Returns the next (event, elem) pair, feeding more data to the
    # parser whenever the buffered events run out.
    #
    # @return An (event, elem) tuple.
    # @exception StopIteration If the document has been fully parsed.

    def __next__(self):
        while 1:
            try:
                item = self._events[self._index]
            except IndexError:
                if self._parser is None:
                    # all data parsed and all events handed out
                    self.root = self._root
                    raise StopIteration
                # load event buffer
                del self._events[:]
                self._index = 0
                data = self._file.read(16384)
                if data:
                    self._parser.feed(data)
                else:
                    self._root = self._parser.close()
                    self._parser = None
                    if self._close_file:
                        # everything has been read; release the file
                        # we opened in __init__
                        self._file.close()
                        self._close_file = False
            else:
                self._index = self._index + 1
                return item

    def __iter__(self):
        return self
Armin Rigo9ed73062005-12-14 18:10:45 +0000936
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
# <p>
# The whole text is fed to a fresh {@link #XMLTreeBuilder} in a single
# call, and the builder is then closed to obtain the result.
#
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

def XML(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    return root
949
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
# <p>
# Only elements with a non-empty "id" attribute are included in the
# dictionary.  If the same id occurs more than once, the element
# visited last by getiterator wins.
#
# @param text A string containing XML data.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text):
    builder = XMLTreeBuilder()
    builder.feed(text)
    root = builder.close()
    by_id = {}
    for node in root.getiterator():
        key = node.get("id")
        if not key:
            continue # no id attribute, or an empty one
        by_id[key] = node
    return root, by_id
968
##
# Parses an XML document from a string constant.  Same as {@link #XML};
# this name is bound to the very same function object.
#
# @def fromstring(text)
# @param text A string containing XML data.
# @return An Element instance.
# @defreturn Element

fromstring = XML
978
##
# Generates a string representation of an XML element, including all
# subelements.  If encoding is None (or otherwise false), the return
# type is a string; otherwise it is a bytes array.
# <p>
# The element is wrapped in an {@link #ElementTree} and written to an
# in-memory object whose write method collects the output fragments,
# which are then joined into a single value.
#
# @param element An Element instance.
# @param encoding Optional output encoding, passed on to the write
#     method of the tree.
# @return An (optionally) encoded string containing the XML data.
# @defreturn string

def tostring(element, encoding=None):
    class _Sink:
        pass
    chunks = []
    sink = _Sink()
    sink.write = chunks.append # collect fragments instead of writing
    ElementTree(element).write(sink, encoding)
    # the fragments are bytes if an encoding was given, strings otherwise
    joiner = b"" if encoding else ""
    return joiner.join(chunks)
Armin Rigo9ed73062005-12-14 18:10:45 +0000999
##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#     is called with two arguments (the tag and the attribute dictionary)
#     to create new Element instances, as necessary.  The objects it
#     returns must provide an append method and tag, text and tail
#     attributes, where text and tail are initially None.

class TreeBuilder:

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = _ElementInterface
        self._factory = element_factory

    ##
    # Flushes the parser buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element
    # @exception AssertionError If there are unclosed elements, or if
    #     no element was ever opened.

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        # use an identity test here; "!=" would be dispatched to the
        # element's own comparison methods, which a custom element
        # factory is free to overload
        assert self._last is not None, "missing toplevel element"
        return self._last

    ##
    # (Internal) Attaches any collected character data to the most
    # recently opened or closed element: as its tail if an end tag was
    # seen last, as its text otherwise.  Data collected before the
    # first element is opened is discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.  The text is only buffered here;
    # it is attached to an element on the next start, end or (internal)
    # flush call.
    #
    # @param data A string.  This should be either an 8-bit string
    #     containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.  The element is appended to the element that
    # is currently open, if any.
    #
    # @param tag The element name.
    # @param attrs A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0 # following data is text, not tail
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element
    # @exception AssertionError If the tag doesn't match the element
    #     that is currently open.

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1 # following data is tail
        return self._last
1087
##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
# <p>
# The expat parser is created with namespace processing enabled and
# "}" as the namespace separator; qualified names are converted to
# the "{uri}local" form before they are passed on to the target.
#
# @keyparam target Target object.  If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class.  The target
#     must provide start, end, data and close methods.
# @keyparam html Predefine HTML entities.  This flag is not supported
#     by the current implementation; the value is ignored.
# @see #ElementTree
# @see #TreeBuilder

class XMLTreeBuilder:

    def __init__(self, html=0, target=None):
        try:
            from xml.parsers import expat
        except ImportError:
            raise ImportError(
                "No module named expat; use SimpleXMLTreeBuilder instead"
                )
        self._parser = parser = expat.ParserCreate(None, "}")
        if target is None:
            target = TreeBuilder()
        self._target = target
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # let expat do the buffering, if supported
        try:
            parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            parser.ordered_attributes = 1
            parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text

    ##
    # (Internal) Expands a qualified name.  Expat reports namespaced
    # names as "uri}local"; this adds the leading "{".  Results are
    # cached per input name.
    #
    # @param key A name, as reported by expat.
    # @return The name in "{uri}local" form, or the name itself if it
    #     is not namespace qualified.

    def _fixname(self, key):
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    ##
    # (Internal) Start tag handler, used when expat reports the
    # attributes as a dictionary.

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = value
        return self._target.start(tag, attrib)

    ##
    # (Internal) Start tag handler, used when expat reports the
    # attributes as a flat [name, value, name, value, ...] list
    # (the ordered_attributes mode).

    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = attrib_in[i+1]
        return self._target.start(tag, attrib)

    def _data(self, text):
        return self._target.data(text)

    def _end(self, tag):
        return self._target.end(self._fixname(tag))

    ##
    # (Internal) Handler for everything that has no handler of its own.
    # It is used to resolve entity references via the <b>entity</b>
    # dictionary, and to collect the tokens of a doctype declaration
    # for the {@link #XMLTreeBuilder.doctype} method.

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                # only the lookup is guarded, so that a KeyError raised
                # inside the target isn't mistaken for a missing entity
                value = self.entity[text[1:-1]]
            except KeyError:
                from xml.parsers import expat
                raise expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                     self._parser.ErrorColumnNumber)
                    )
            self._target.data(value)
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                # tokens so far: name, PUBLIC/SYSTEM keyword, then the
                # quoted identifier(s)
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return # incomplete, or not a form we understand
                if pubid:
                    pubid = pubid[1:-1] # strip quotes
                self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # Handles a doctype declaration.  The default implementation does
    # nothing; override it in a subclass to get at the information.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        pass

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        self._parser.Parse(data, 0)

    ##
    # Finishes feeding data to the parser.  The builder cannot be used
    # after this call, since the references to the parser and to the
    # target are removed.
    #
    # @return An element structure (whatever the target's close method
    #     returns).
    # @defreturn Element

    def close(self):
        self._parser.Parse("", 1) # end of data
        tree = self._target.close()
        del self._target, self._parser # get rid of circular references
        return tree
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001237
# compatibility: XMLParser is an alias for the XMLTreeBuilder class
XMLParser = XMLTreeBuilder