Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

blob: 694d6c4a9091629d017232635dba75660cc82df7 [file] [log] [blame]

Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1	#
				2	# ElementTree
				3	# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
				4	#
				5	# light-weight XML support for Python 1.5.2 and later.
				6	#
				7	# history:
				8	# 2001-10-20 fl created (from various sources)
				9	# 2001-11-01 fl return root from parse method
				10	# 2002-02-16 fl sort attributes in lexical order
				11	# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
				12	# 2002-05-01 fl finished TreeBuilder refactoring
				13	# 2002-07-14 fl added basic namespace support to ElementTree.write
				14	# 2002-07-25 fl added QName attribute support
				15	# 2002-10-20 fl fixed encoding in write
				16	# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
				17	# 2002-11-27 fl accept file objects or file names for parse/write
				18	# 2002-12-04 fl moved XMLTreeBuilder back to this module
				19	# 2003-01-11 fl fixed entity encoding glitch for us-ascii
				20	# 2003-02-13 fl added XML literal factory
				21	# 2003-02-21 fl added ProcessingInstruction/PI factory
				22	# 2003-05-11 fl added tostring/fromstring helpers
				23	# 2003-05-26 fl added ElementPath support
				24	# 2003-07-05 fl added makeelement factory method
				25	# 2003-07-28 fl added more well-known namespace prefixes
				26	# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
				27	# 2003-09-04 fl fall back on emulator if ElementPath is not installed
				28	# 2003-10-31 fl markup updates
				29	# 2003-11-15 fl fixed nested namespace bug
				30	# 2004-03-28 fl added XMLID helper
				31	# 2004-06-02 fl added default support to findtext
				32	# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
				33	# 2004-08-23 fl take advantage of post-2.1 expat features
				34	# 2005-02-01 fl added iterparse implementation
				35	# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
				36	#
				37	# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
				38	#
				39	# fredrik@pythonware.com
				40	# http://www.pythonware.com
				41	#
				42	# --------------------------------------------------------------------
				43	# The ElementTree toolkit is
				44	#
				45	# Copyright (c) 1999-2005 by Fredrik Lundh
				46	#
				47	# By obtaining, using, and/or copying this software and/or its
				48	# associated documentation, you agree that you have read, understood,
				49	# and will comply with the following terms and conditions:
				50	#
				51	# Permission to use, copy, modify, and distribute this software and
				52	# its associated documentation for any purpose and without fee is
				53	# hereby granted, provided that the above copyright notice appears in
				54	# all copies, and that both that copyright notice and this permission
				55	# notice appear in supporting documentation, and that the name of
				56	# Secret Labs AB or the author not be used in advertising or publicity
				57	# pertaining to distribution of the software without specific, written
				58	# prior permission.
				59	#
				60	# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
				61	# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
				62	# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
				63	# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
				64	# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
				65	# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
				66	# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
				67	# OF THIS SOFTWARE.
				68	# --------------------------------------------------------------------
				69
Fredrik Lundh	63168a5	2005-12-14 22:29:34 +0000	[diff] [blame]	70	# Licensed to PSF under a Contributor Agreement.
				71	# See http://www.python.org/2.4/license for licensing details.
				72
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	73	__all__ = [
				74	# public symbols
				75	"Comment",
				76	"dump",
				77	"Element", "ElementTree",
				78	"fromstring",
				79	"iselement", "iterparse",
				80	"parse",
				81	"PI", "ProcessingInstruction",
				82	"QName",
				83	"SubElement",
				84	"tostring",
				85	"TreeBuilder",
				86	"VERSION", "XML",
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	87	"XMLParser", "XMLTreeBuilder",
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	88	]
				89
				90	##
				91	# The <b>Element</b> type is a flexible container object, designed to
				92	# store hierarchical data structures in memory. The type can be
				93	# described as a cross between a list and a dictionary.
				94	# <p>
				95	# Each element has a number of properties associated with it:
				96	# <ul>
				97	# <li>a <i>tag</i>. This is a string identifying what kind of data
				98	# this element represents (the element type, in other words).</li>
				99	# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
				100	# <li>a <i>text</i> string.</li>
				101	# <li>an optional <i>tail</i> string.</li>
				102	# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
				103	# </ul>
				104	#
				105	# To create an element instance, use the {@link #Element} or {@link
				106	# #SubElement} factory functions.
				107	# <p>
				108	# The {@link #ElementTree} class can be used to wrap an element
				109	# structure, and convert it from and to XML.
				110	##
				111
Neal Norwitz	9d72bb4	2007-04-17 08:48:32 +0000	[diff] [blame]	112	import sys, re
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	113
Alex Martelli	6cefeb0	2006-08-21 23:45:19 +0000	[diff] [blame]	114	from . import ElementPath
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	115
				116	# TODO: add support for custom namespace resolvers/default namespaces
				117	# TODO: add improved support for incremental parsing
				118
				119	VERSION = "1.2.6"
				120
				121	##
				122	# Internal element class. This class defines the Element interface,
				123	# and provides a reference implementation of this interface.
				124	# <p>
				125	# You should not create instances of this class directly. Use the
				126	# appropriate factory functions instead, such as {@link #Element}
				127	# and {@link #SubElement}.
				128	#
				129	# @see Element
				130	# @see SubElement
				131	# @see Comment
				132	# @see ProcessingInstruction
				133
				134	class _ElementInterface:
				135	# <tag attrib>text<child/>...</tag>tail
				136
				137	##
				138	# (Attribute) Element tag.
				139
				140	tag = None
				141
				142	##
				143	# (Attribute) Element attribute dictionary. Where possible, use
				144	# {@link #_ElementInterface.get},
				145	# {@link #_ElementInterface.set},
				146	# {@link #_ElementInterface.keys}, and
				147	# {@link #_ElementInterface.items} to access
				148	# element attributes.
				149
				150	attrib = None
				151
				152	##
				153	# (Attribute) Text before first subelement. This is either a
				154	# string or the value None, if there was no text.
				155
				156	text = None
				157
				158	##
				159	# (Attribute) Text after this element's end tag, but before the
				160	# next sibling element's start tag. This is either a string or
				161	# the value None, if there was no text.
				162
				163	tail = None # text after end tag, if any
				164
				165	def __init__(self, tag, attrib):
				166	self.tag = tag
				167	self.attrib = attrib
				168	self._children = []
				169
				170	def __repr__(self):
				171	return "<Element %s at %x>" % (self.tag, id(self))
				172
				173	##
				174	# Creates a new element object of the same type as this element.
				175	#
				176	# @param tag Element tag.
				177	# @param attrib Element attributes, given as a dictionary.
				178	# @return A new element instance.
				179
				180	def makeelement(self, tag, attrib):
				181	return Element(tag, attrib)
				182
				183	##
				184	# Returns the number of subelements.
				185	#
				186	# @return The number of subelements.
				187
				188	def __len__(self):
				189	return len(self._children)
				190
				191	##
				192	# Returns the given subelement.
				193	#
				194	# @param index What subelement to return.
				195	# @return The given subelement.
				196	# @exception IndexError If the given element does not exist.
				197
				198	def __getitem__(self, index):
				199	return self._children[index]
				200
				201	##
				202	# Replaces the given subelement.
				203	#
				204	# @param index What subelement to replace.
				205	# @param element The new element value.
				206	# @exception IndexError If the given element does not exist.
				207	# @exception AssertionError If element is not a valid object.
				208
				209	def __setitem__(self, index, element):
				210	assert iselement(element)
				211	self._children[index] = element
				212
				213	##
				214	# Deletes the given subelement.
				215	#
				216	# @param index What subelement to delete.
				217	# @exception IndexError If the given element does not exist.
				218
				219	def __delitem__(self, index):
				220	del self._children[index]
				221
				222	##
				223	# Returns a list containing subelements in the given range.
				224	#
				225	# @param start The first subelement to return.
				226	# @param stop The first subelement that shouldn't be returned.
				227	# @return A sequence object containing subelements.
				228
				229	def __getslice__(self, start, stop):
				230	return self._children[start:stop]
				231
				232	##
				233	# Replaces a number of subelements with elements from a sequence.
				234	#
				235	# @param start The first subelement to replace.
				236	# @param stop The first subelement that shouldn't be replaced.
				237	# @param elements A sequence object with zero or more elements.
				238	# @exception AssertionError If a sequence member is not a valid object.
				239
				240	def __setslice__(self, start, stop, elements):
				241	for element in elements:
				242	assert iselement(element)
				243	self._children[start:stop] = list(elements)
				244
				245	##
				246	# Deletes a number of subelements.
				247	#
				248	# @param start The first subelement to delete.
				249	# @param stop The first subelement to leave in there.
				250
				251	def __delslice__(self, start, stop):
				252	del self._children[start:stop]
				253
				254	##
				255	# Adds a subelement to the end of this element.
				256	#
				257	# @param element The element to add.
				258	# @exception AssertionError If a sequence member is not a valid object.
				259
				260	def append(self, element):
				261	assert iselement(element)
				262	self._children.append(element)
				263
				264	##
				265	# Inserts a subelement at the given position in this element.
				266	#
				267	# @param index Where to insert the new subelement.
				268	# @exception AssertionError If the element is not a valid object.
				269
				270	def insert(self, index, element):
				271	assert iselement(element)
				272	self._children.insert(index, element)
				273
				274	##
				275	# Removes a matching subelement. Unlike the <b>find</b> methods,
				276	# this method compares elements based on identity, not on tag
				277	# value or contents.
				278	#
				279	# @param element What element to remove.
				280	# @exception ValueError If a matching element could not be found.
				281	# @exception AssertionError If the element is not a valid object.
				282
				283	def remove(self, element):
				284	assert iselement(element)
				285	self._children.remove(element)
				286
				287	##
				288	# Returns all subelements. The elements are returned in document
				289	# order.
				290	#
				291	# @return A list of subelements.
				292	# @defreturn list of Element instances
				293
				294	def getchildren(self):
				295	return self._children
				296
				297	##
				298	# Finds the first matching subelement, by tag name or path.
				299	#
				300	# @param path What element to look for.
				301	# @return The first matching element, or None if no element was found.
				302	# @defreturn Element or None
				303
				304	def find(self, path):
				305	return ElementPath.find(self, path)
				306
				307	##
				308	# Finds text for the first matching subelement, by tag name or path.
				309	#
				310	# @param path What element to look for.
				311	# @param default What to return if the element was not found.
				312	# @return The text content of the first matching element, or the
				313	# default value no element was found. Note that if the element
				314	# has is found, but has no text content, this method returns an
				315	# empty string.
				316	# @defreturn string
				317
				318	def findtext(self, path, default=None):
				319	return ElementPath.findtext(self, path, default)
				320
				321	##
				322	# Finds all matching subelements, by tag name or path.
				323	#
				324	# @param path What element to look for.
				325	# @return A list or iterator containing all matching elements,
				326	# in document order.
				327	# @defreturn list of Element instances
				328
				329	def findall(self, path):
				330	return ElementPath.findall(self, path)
				331
				332	##
				333	# Resets an element. This function removes all subelements, clears
				334	# all attributes, and sets the text and tail attributes to None.
				335
				336	def clear(self):
				337	self.attrib.clear()
				338	self._children = []
				339	self.text = self.tail = None
				340
				341	##
				342	# Gets an element attribute.
				343	#
				344	# @param key What attribute to look for.
				345	# @param default What to return if the attribute was not found.
				346	# @return The attribute value, or the default value, if the
				347	# attribute was not found.
				348	# @defreturn string or None
				349
				350	def get(self, key, default=None):
				351	return self.attrib.get(key, default)
				352
				353	##
				354	# Sets an element attribute.
				355	#
				356	# @param key What attribute to set.
				357	# @param value The attribute value.
				358
				359	def set(self, key, value):
				360	self.attrib[key] = value
				361
				362	##
				363	# Gets a list of attribute names. The names are returned in an
				364	# arbitrary order (just like for an ordinary Python dictionary).
				365	#
				366	# @return A list of element attribute names.
				367	# @defreturn list of strings
				368
				369	def keys(self):
				370	return self.attrib.keys()
				371
				372	##
				373	# Gets element attributes, as a sequence. The attributes are
				374	# returned in an arbitrary order.
				375	#
				376	# @return A list of (name, value) tuples for all attributes.
				377	# @defreturn list of (string, string) tuples
				378
				379	def items(self):
				380	return self.attrib.items()
				381
				382	##
				383	# Creates a tree iterator. The iterator loops over this element
				384	# and all subelements, in document order, and returns all elements
				385	# with a matching tag.
				386	# <p>
				387	# If the tree structure is modified during iteration, the result
				388	# is undefined.
				389	#
				390	# @param tag What tags to look for (default is to return all elements).
				391	# @return A list or iterator containing all the matching elements.
				392	# @defreturn list or iterator
				393
				394	def getiterator(self, tag=None):
				395	nodes = []
				396	if tag == "*":
				397	tag = None
				398	if tag is None or self.tag == tag:
				399	nodes.append(self)
				400	for node in self._children:
				401	nodes.extend(node.getiterator(tag))
				402	return nodes
				403
				404	# compatibility
				405	_Element = _ElementInterface
				406
				407	##
				408	# Element factory. This function returns an object implementing the
				409	# standard Element interface. The exact class or type of that object
				410	# is implementation dependent, but it will always be compatible with
				411	# the {@link #_ElementInterface} class in this module.
				412	# <p>
				413	# The element name, attribute names, and attribute values can be
				414	# either 8-bit ASCII strings or Unicode strings.
				415	#
				416	# @param tag The element name.
				417	# @param attrib An optional dictionary, containing element attributes.
				418	# @param **extra Additional attributes, given as keyword arguments.
				419	# @return An element instance.
				420	# @defreturn Element
				421
				422	def Element(tag, attrib={}, **extra):
				423	attrib = attrib.copy()
				424	attrib.update(extra)
				425	return _ElementInterface(tag, attrib)
				426
				427	##
				428	# Subelement factory. This function creates an element instance, and
				429	# appends it to an existing element.
				430	# <p>
				431	# The element name, attribute names, and attribute values can be
				432	# either 8-bit ASCII strings or Unicode strings.
				433	#
				434	# @param parent The parent element.
				435	# @param tag The subelement name.
				436	# @param attrib An optional dictionary, containing element attributes.
				437	# @param **extra Additional attributes, given as keyword arguments.
				438	# @return An element instance.
				439	# @defreturn Element
				440
				441	def SubElement(parent, tag, attrib={}, **extra):
				442	attrib = attrib.copy()
				443	attrib.update(extra)
				444	element = parent.makeelement(tag, attrib)
				445	parent.append(element)
				446	return element
				447
				448	##
				449	# Comment element factory. This factory function creates a special
				450	# element that will be serialized as an XML comment.
				451	# <p>
				452	# The comment string can be either an 8-bit ASCII string or a Unicode
				453	# string.
				454	#
				455	# @param text A string containing the comment string.
				456	# @return An element instance, representing a comment.
				457	# @defreturn Element
				458
				459	def Comment(text=None):
				460	element = Element(Comment)
				461	element.text = text
				462	return element
				463
				464	##
				465	# PI element factory. This factory function creates a special element
				466	# that will be serialized as an XML processing instruction.
				467	#
				468	# @param target A string containing the PI target.
				469	# @param text A string containing the PI contents, if any.
				470	# @return An element instance, representing a PI.
				471	# @defreturn Element
				472
				473	def ProcessingInstruction(target, text=None):
				474	element = Element(ProcessingInstruction)
				475	element.text = target
				476	if text:
				477	element.text = element.text + " " + text
				478	return element
				479
				480	PI = ProcessingInstruction
				481
				482	##
				483	# QName wrapper. This can be used to wrap a QName attribute value, in
				484	# order to get proper namespace handling on output.
				485	#
				486	# @param text A string containing the QName value, in the form {uri}local,
				487	# or, if the tag argument is given, the URI part of a QName.
# @param tag Optional tag. If given, the first argument is interpreted as
# a URI, and this argument is interpreted as a local name.
				490	# @return An opaque object, representing the QName.
				491
				492	class QName:
				493	def __init__(self, text_or_uri, tag=None):
				494	if tag:
				495	text_or_uri = "{%s}%s" % (text_or_uri, tag)
				496	self.text = text_or_uri
				497	def __str__(self):
				498	return self.text
				499	def __hash__(self):
				500	return hash(self.text)
				501	def __cmp__(self, other):
				502	if isinstance(other, QName):
				503	return cmp(self.text, other.text)
				504	return cmp(self.text, other)
				505
				506	##
				507	# ElementTree wrapper class. This class represents an entire element
				508	# hierarchy, and adds some extra support for serialization to and from
				509	# standard XML.
				510	#
				511	# @param element Optional root element.
				512	# @keyparam file Optional file handle or name. If given, the
				513	# tree is initialized with the contents of this XML file.
				514
				515	class ElementTree:
				516
				517	def __init__(self, element=None, file=None):
				518	assert element is None or iselement(element)
				519	self._root = element # first node
				520	if file:
				521	self.parse(file)
				522
				523	##
				524	# Gets the root element for this tree.
				525	#
				526	# @return An element instance.
				527	# @defreturn Element
				528
				529	def getroot(self):
				530	return self._root
				531
				532	##
				533	# Replaces the root element for this tree. This discards the
				534	# current contents of the tree, and replaces it with the given
				535	# element. Use with care.
				536	#
				537	# @param element An element instance.
				538
				539	def _setroot(self, element):
				540	assert iselement(element)
				541	self._root = element
				542
				543	##
				544	# Loads an external XML document into this element tree.
				545	#
				546	# @param source A file name or file object.
				547	# @param parser An optional parser instance. If not given, the
				548	# standard {@link XMLTreeBuilder} parser is used.
				549	# @return The document root element.
				550	# @defreturn Element
				551
				552	def parse(self, source, parser=None):
				553	if not hasattr(source, "read"):
				554	source = open(source, "rb")
				555	if not parser:
				556	parser = XMLTreeBuilder()
				557	while 1:
				558	data = source.read(32768)
				559	if not data:
				560	break
				561	parser.feed(data)
				562	self._root = parser.close()
				563	return self._root
				564
				565	##
				566	# Creates a tree iterator for the root element. The iterator loops
				567	# over all elements in this tree, in document order.
				568	#
				569	# @param tag What tags to look for (default is to return all elements)
				570	# @return An iterator.
				571	# @defreturn iterator
				572
				573	def getiterator(self, tag=None):
				574	assert self._root is not None
				575	return self._root.getiterator(tag)
				576
				577	##
				578	# Finds the first toplevel element with given tag.
				579	# Same as getroot().find(path).
				580	#
				581	# @param path What element to look for.
				582	# @return The first matching element, or None if no element was found.
				583	# @defreturn Element or None
				584
				585	def find(self, path):
				586	assert self._root is not None
				587	if path[:1] == "/":
				588	path = "." + path
				589	return self._root.find(path)
				590
				591	##
				592	# Finds the element text for the first toplevel element with given
				593	# tag. Same as getroot().findtext(path).
				594	#
				595	# @param path What toplevel element to look for.
				596	# @param default What to return if the element was not found.
				597	# @return The text content of the first matching element, or the
				598	# default value no element was found. Note that if the element
				599	# has is found, but has no text content, this method returns an
				600	# empty string.
				601	# @defreturn string
				602
				603	def findtext(self, path, default=None):
				604	assert self._root is not None
				605	if path[:1] == "/":
				606	path = "." + path
				607	return self._root.findtext(path, default)
				608
				609	##
				610	# Finds all toplevel elements with the given tag.
				611	# Same as getroot().findall(path).
				612	#
				613	# @param path What element to look for.
				614	# @return A list or iterator containing all matching elements,
				615	# in document order.
				616	# @defreturn list of Element instances
				617
				618	def findall(self, path):
				619	assert self._root is not None
				620	if path[:1] == "/":
				621	path = "." + path
				622	return self._root.findall(path)
				623
				624	##
				625	# Writes the element tree to a file, as XML.
				626	#
				627	# @param file A file name, or a file object opened for writing.
				628	# @param encoding Optional output encoding (default is US-ASCII).
				629
				630	def write(self, file, encoding="us-ascii"):
				631	assert self._root is not None
				632	if not hasattr(file, "write"):
				633	file = open(file, "wb")
				634	if not encoding:
				635	encoding = "us-ascii"
				636	elif encoding != "utf-8" and encoding != "us-ascii":
				637	file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
				638	self._write(file, self._root, encoding, {})
				639
				640	def _write(self, file, node, encoding, namespaces):
				641	# write XML to file
				642	tag = node.tag
				643	if tag is Comment:
				644	file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
				645	elif tag is ProcessingInstruction:
				646	file.write("<?%s?>" % _escape_cdata(node.text, encoding))
				647	else:
Guido van Rossum	cc2b016	2007-02-11 06:12:03 +0000	[diff] [blame]	648	items = list(node.items())
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	649	xmlns_items = [] # new namespaces in this scope
				650	try:
				651	if isinstance(tag, QName) or tag[:1] == "{":
				652	tag, xmlns = fixtag(tag, namespaces)
				653	if xmlns: xmlns_items.append(xmlns)
				654	except TypeError:
				655	_raise_serialization_error(tag)
				656	file.write("<" + _encode(tag, encoding))
				657	if items or xmlns_items:
				658	items.sort() # lexical order
				659	for k, v in items:
				660	try:
				661	if isinstance(k, QName) or k[:1] == "{":
				662	k, xmlns = fixtag(k, namespaces)
				663	if xmlns: xmlns_items.append(xmlns)
				664	except TypeError:
				665	_raise_serialization_error(k)
				666	try:
				667	if isinstance(v, QName):
				668	v, xmlns = fixtag(v, namespaces)
				669	if xmlns: xmlns_items.append(xmlns)
				670	except TypeError:
				671	_raise_serialization_error(v)
				672	file.write(" %s=\"%s\"" % (_encode(k, encoding),
				673	_escape_attrib(v, encoding)))
				674	for k, v in xmlns_items:
				675	file.write(" %s=\"%s\"" % (_encode(k, encoding),
				676	_escape_attrib(v, encoding)))
				677	if node.text or len(node):
				678	file.write(">")
				679	if node.text:
				680	file.write(_escape_cdata(node.text, encoding))
				681	for n in node:
				682	self._write(file, n, encoding, namespaces)
				683	file.write("</" + _encode(tag, encoding) + ">")
				684	else:
				685	file.write(" />")
				686	for k, v in xmlns_items:
				687	del namespaces[v]
				688	if node.tail:
				689	file.write(_escape_cdata(node.tail, encoding))
				690
				691	# --------------------------------------------------------------------
				692	# helpers
				693
				694	##
				695	# Checks if an object appears to be a valid element object.
				696	#
# @param element An element instance.
				698	# @return A true value if this is an element object.
				699	# @defreturn flag
				700
				701	def iselement(element):
				702	# FIXME: not sure about this; might be a better idea to look
				703	# for tag/attrib/text attributes
				704	return isinstance(element, _ElementInterface) or hasattr(element, "tag")
				705
				706	##
				707	# Writes an element tree or element structure to sys.stdout. This
				708	# function should be used for debugging only.
				709	# <p>
				710	# The exact output format is implementation dependent. In this
				711	# version, it's written as an ordinary XML file.
				712	#
				713	# @param elem An element tree or an individual element.
				714
				715	def dump(elem):
				716	# debugging
				717	if not isinstance(elem, ElementTree):
				718	elem = ElementTree(elem)
				719	elem.write(sys.stdout)
				720	tail = elem.getroot().tail
				721	if not tail or tail[-1] != "\n":
				722	sys.stdout.write("\n")
				723
				724	def _encode(s, encoding):
				725	try:
				726	return s.encode(encoding)
				727	except AttributeError:
				728	return s # 1.5.2: assume the string uses the right encoding
				729
				730	if sys.version[:3] == "1.5":
				731	_escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
				732	else:
				733	_escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
				734
				735	_escape_map = {
				736	"&": "&",
				737	"<": "<",
				738	">": ">",
				739	'"': """,
				740	}
				741
				742	_namespace_map = {
				743	# "well-known" namespace prefixes
				744	"http://www.w3.org/XML/1998/namespace": "xml",
				745	"http://www.w3.org/1999/xhtml": "html",
				746	"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
				747	"http://schemas.xmlsoap.org/wsdl/": "wsdl",
				748	}
				749
				750	def _raise_serialization_error(text):
				751	raise TypeError(
				752	"cannot serialize %r (type %s)" % (text, type(text).__name__)
				753	)
				754
				755	def _encode_entity(text, pattern=_escape):
				756	# map reserved and non-ascii characters to numerical entities
				757	def escape_entities(m, map=_escape_map):
				758	out = []
				759	append = out.append
				760	for char in m.group():
				761	text = map.get(char)
				762	if text is None:
				763	text = "&#%d;" % ord(char)
				764	append(text)
Neal Norwitz	9d72bb4	2007-04-17 08:48:32 +0000	[diff] [blame]	765	return "".join(out)
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	766	try:
				767	return _encode(pattern.sub(escape_entities, text), "ascii")
				768	except TypeError:
				769	_raise_serialization_error(text)
				770
				771	#
				772	# the following functions assume an ascii-compatible encoding
				773	# (or "utf-16")
				774
def _escape_cdata(text, encoding=None):
    """Escape character data for use as element content.

    Replaces "&", "<" and ">" with the corresponding predefined
    entities.  When encoding is given, the escaped text is then passed
    through _encode(); if that raises UnicodeError the original text is
    handed to _encode_entity() instead.  Values that cannot be processed
    (TypeError or AttributeError) are reported through
    _raise_serialization_error().
    """
    try:
        # Escape before encoding: the replacements are text operations
        # and must not be applied to an already-encoded value.  "&" goes
        # first so the entities produced below are not escaped again.
        escaped = text.replace("&", "&amp;")
        escaped = escaped.replace("<", "&lt;")
        escaped = escaped.replace(">", "&gt;")
        if encoding:
            try:
                return _encode(escaped, encoding)
            except UnicodeError:
                return _encode_entity(text)
        return escaped
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
				789
def _escape_attrib(text, encoding=None):
    """Escape a string for use as an attribute value.

    Replaces "&", "'", '"', "<" and ">" with the corresponding
    predefined entities.  When encoding is given, the escaped text is
    then passed through _encode(); if that raises UnicodeError the
    original text is handed to _encode_entity() instead.  Values that
    cannot be processed (TypeError or AttributeError) are reported
    through _raise_serialization_error().
    """
    try:
        # Escape before encoding: the replacements are text operations
        # and must not be applied to an already-encoded value.  "&" goes
        # first so the entities produced below are not escaped again.
        escaped = text.replace("&", "&amp;")
        escaped = escaped.replace("'", "&apos;") # FIXME: overkill
        escaped = escaped.replace("\"", "&quot;")
        escaped = escaped.replace("<", "&lt;")
        escaped = escaped.replace(">", "&gt;")
        if encoding:
            try:
                return _encode(escaped, encoding)
            except UnicodeError:
                return _encode_entity(text)
        return escaped
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
				806
				807	def fixtag(tag, namespaces):
				808	# given a decorated tag (of the form {uri}tag), return prefixed
				809	# tag and namespace declaration, if any
				810	if isinstance(tag, QName):
				811	tag = tag.text
Neal Norwitz	9d72bb4	2007-04-17 08:48:32 +0000	[diff] [blame]	812	namespace_uri, tag = tag[1:].split("}", 1)
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	813	prefix = namespaces.get(namespace_uri)
				814	if prefix is None:
				815	prefix = _namespace_map.get(namespace_uri)
				816	if prefix is None:
				817	prefix = "ns%d" % len(namespaces)
				818	namespaces[namespace_uri] = prefix
				819	if prefix == "xml":
				820	xmlns = None
				821	else:
				822	xmlns = ("xmlns:%s" % prefix, namespace_uri)
				823	else:
				824	xmlns = None
				825	return "%s:%s" % (prefix, tag), xmlns
				826
				827	##
				828	# Parses an XML document into an element tree.
				829	#
				830	# @param source A filename or file object containing XML data.
				831	# @param parser An optional parser instance. If not given, the
				832	# standard {@link XMLTreeBuilder} parser is used.
				833	# @return An ElementTree instance
				834
				835	def parse(source, parser=None):
				836	tree = ElementTree()
				837	tree.parse(source, parser)
				838	return tree
				839
				840	##
				841	# Parses an XML document into an element tree incrementally, and reports
				842	# what's going on to the user.
				843	#
				844	# @param source A filename or file object containing XML data.
				845	# @param events A list of events to report back. If omitted, only "end"
				846	# events are reported.
				847	# @return A (event, elem) iterator.
				848
				849	class iterparse:
				850
				851	def __init__(self, source, events=None):
				852	if not hasattr(source, "read"):
				853	source = open(source, "rb")
				854	self._file = source
				855	self._events = []
				856	self._index = 0
				857	self.root = self._root = None
				858	self._parser = XMLTreeBuilder()
				859	# wire up the parser for event reporting
				860	parser = self._parser._parser
				861	append = self._events.append
				862	if events is None:
				863	events = ["end"]
				864	for event in events:
				865	if event == "start":
				866	try:
				867	parser.ordered_attributes = 1
				868	parser.specified_attributes = 1
				869	def handler(tag, attrib_in, event=event, append=append,
				870	start=self._parser._start_list):
				871	append((event, start(tag, attrib_in)))
				872	parser.StartElementHandler = handler
				873	except AttributeError:
				874	def handler(tag, attrib_in, event=event, append=append,
				875	start=self._parser._start):
				876	append((event, start(tag, attrib_in)))
				877	parser.StartElementHandler = handler
				878	elif event == "end":
				879	def handler(tag, event=event, append=append,
				880	end=self._parser._end):
				881	append((event, end(tag)))
				882	parser.EndElementHandler = handler
				883	elif event == "start-ns":
				884	def handler(prefix, uri, event=event, append=append):
				885	try:
				886	uri = _encode(uri, "ascii")
				887	except UnicodeError:
				888	pass
				889	append((event, (prefix or "", uri)))
				890	parser.StartNamespaceDeclHandler = handler
				891	elif event == "end-ns":
				892	def handler(prefix, event=event, append=append):
				893	append((event, None))
				894	parser.EndNamespaceDeclHandler = handler
				895
Georg Brandl	a18af4e	2007-04-21 15:47:16 +0000	[diff] [blame]	896	def __next__(self):
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	897	while 1:
				898	try:
				899	item = self._events[self._index]
				900	except IndexError:
				901	if self._parser is None:
				902	self.root = self._root
				903	try:
				904	raise StopIteration
				905	except NameError:
				906	raise IndexError
				907	# load event buffer
				908	del self._events[:]
				909	self._index = 0
				910	data = self._file.read(16384)
				911	if data:
				912	self._parser.feed(data)
				913	else:
				914	self._root = self._parser.close()
				915	self._parser = None
				916	else:
				917	self._index = self._index + 1
				918	return item
				919
				920	try:
				921	iter
				922	def __iter__(self):
				923	return self
				924	except NameError:
				925	def __getitem__(self, index):
Georg Brandl	a18af4e	2007-04-21 15:47:16 +0000	[diff] [blame]	926	return self.__next__()
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	927
				928	##
				929	# Parses an XML document from a string constant. This function can
				930	# be used to embed "XML literals" in Python code.
				931	#
# @param text A string containing XML data.
				933	# @return An Element instance.
				934	# @defreturn Element
				935
				936	def XML(text):
				937	parser = XMLTreeBuilder()
				938	parser.feed(text)
				939	return parser.close()
				940
				941	##
				942	# Parses an XML document from a string constant, and also returns
				943	# a dictionary which maps from element id:s to elements.
				944	#
# @param text A string containing XML data.
				946	# @return A tuple containing an Element instance and a dictionary.
				947	# @defreturn (Element, dictionary)
				948
				949	def XMLID(text):
				950	parser = XMLTreeBuilder()
				951	parser.feed(text)
				952	tree = parser.close()
				953	ids = {}
				954	for elem in tree.getiterator():
				955	id = elem.get("id")
				956	if id:
				957	ids[id] = elem
				958	return tree, ids
				959
				960	##
				961	# Parses an XML document from a string constant. Same as {@link #XML}.
				962	#
				963	# @def fromstring(text)
# @param text A string containing XML data.
				965	# @return An Element instance.
				966	# @defreturn Element
				967
fromstring = XML  # alias: a second name bound to the same function object
				969
				970	##
				971	# Generates a string representation of an XML element, including all
				972	# subelements.
				973	#
# @param element An Element instance.
# @param encoding Optional output encoding (default is None); passed
#     unchanged to ElementTree.write.
				975	# @return An encoded string containing the XML data.
				976	# @defreturn string
				977
				978	def tostring(element, encoding=None):
				979	class dummy:
				980	pass
				981	data = []
				982	file = dummy()
				983	file.write = data.append
				984	ElementTree(element).write(file, encoding)
Neal Norwitz	9d72bb4	2007-04-17 08:48:32 +0000	[diff] [blame]	985	return "".join(data)
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	986
				987	##
				988	# Generic element structure builder. This builder converts a sequence
				989	# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
				990	# #TreeBuilder.end} method calls to a well-formed element structure.
				991	# <p>
				992	# You can use this class to build an element structure using a custom XML
				993	# parser, or a parser for some other XML-like format.
				994	#
				995	# @param element_factory Optional element factory. This factory
				996	# is called to create new Element instances, as necessary.
				997
				998	class TreeBuilder:
				999
				1000	def __init__(self, element_factory=None):
				1001	self._data = [] # data collector
				1002	self._elem = [] # element stack
				1003	self._last = None # last element
				1004	self._tail = None # true if we're after an end tag
				1005	if element_factory is None:
				1006	element_factory = _ElementInterface
				1007	self._factory = element_factory
				1008
				1009	##
				1010	# Flushes the parser buffers, and returns the toplevel documen
				1011	# element.
				1012	#
				1013	# @return An Element instance.
				1014	# @defreturn Element
				1015
				1016	def close(self):
				1017	assert len(self._elem) == 0, "missing end tags"
				1018	assert self._last != None, "missing toplevel element"
				1019	return self._last
				1020
				1021	def _flush(self):
				1022	if self._data:
				1023	if self._last is not None:
Neal Norwitz	9d72bb4	2007-04-17 08:48:32 +0000	[diff] [blame]	1024	text = "".join(self._data)
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1025	if self._tail:
				1026	assert self._last.tail is None, "internal error (tail)"
				1027	self._last.tail = text
				1028	else:
				1029	assert self._last.text is None, "internal error (text)"
				1030	self._last.text = text
				1031	self._data = []
				1032
				1033	##
				1034	# Adds text to the current element.
				1035	#
				1036	# @param data A string. This should be either an 8-bit string
				1037	# containing ASCII text, or a Unicode string.
				1038
				1039	def data(self, data):
				1040	self._data.append(data)
				1041
				1042	##
				1043	# Opens a new element.
				1044	#
				1045	# @param tag The element name.
				1046	# @param attrib A dictionary containing element attributes.
				1047	# @return The opened element.
				1048	# @defreturn Element
				1049
				1050	def start(self, tag, attrs):
				1051	self._flush()
				1052	self._last = elem = self._factory(tag, attrs)
				1053	if self._elem:
				1054	self._elem[-1].append(elem)
				1055	self._elem.append(elem)
				1056	self._tail = 0
				1057	return elem
				1058
				1059	##
				1060	# Closes the current element.
				1061	#
				1062	# @param tag The element name.
				1063	# @return The closed element.
				1064	# @defreturn Element
				1065
				1066	def end(self, tag):
				1067	self._flush()
				1068	self._last = self._elem.pop()
				1069	assert self._last.tag == tag,\
				1070	"end tag mismatch (expected %s, got %s)" % (
				1071	self._last.tag, tag)
				1072	self._tail = 1
				1073	return self._last
				1074
				1075	##
				1076	# Element structure builder for XML source data, based on the
				1077	# <b>expat</b> parser.
				1078	#
				1079	# @keyparam target Target object. If omitted, the builder uses an
				1080	# instance of the standard {@link #TreeBuilder} class.
				1081	# @keyparam html Predefine HTML entities. This flag is not supported
				1082	# by the current implementation.
				1083	# @see #ElementTree
				1084	# @see #TreeBuilder
				1085
				1086	class XMLTreeBuilder:
				1087
				1088	def __init__(self, html=0, target=None):
				1089	try:
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	1090	from xml.parsers import expat
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1091	except ImportError:
				1092	raise ImportError(
				1093	"No module named expat; use SimpleXMLTreeBuilder instead"
				1094	)
				1095	self._parser = parser = expat.ParserCreate(None, "}")
				1096	if target is None:
				1097	target = TreeBuilder()
				1098	self._target = target
				1099	self._names = {} # name memo cache
				1100	# callbacks
				1101	parser.DefaultHandlerExpand = self._default
				1102	parser.StartElementHandler = self._start
				1103	parser.EndElementHandler = self._end
				1104	parser.CharacterDataHandler = self._data
				1105	# let expat do the buffering, if supported
				1106	try:
				1107	self._parser.buffer_text = 1
				1108	except AttributeError:
				1109	pass
				1110	# use new-style attribute handling, if supported
				1111	try:
				1112	self._parser.ordered_attributes = 1
				1113	self._parser.specified_attributes = 1
				1114	parser.StartElementHandler = self._start_list
				1115	except AttributeError:
				1116	pass
				1117	encoding = None
				1118	if not parser.returns_unicode:
				1119	encoding = "utf-8"
				1120	# target.xml(encoding, None)
				1121	self._doctype = None
				1122	self.entity = {}
				1123
				1124	def _fixtext(self, text):
				1125	# convert text string to ascii, if possible
				1126	try:
				1127	return _encode(text, "ascii")
				1128	except UnicodeError:
				1129	return text
				1130
				1131	def _fixname(self, key):
				1132	# expand qname, and convert name string to ascii, if possible
				1133	try:
				1134	name = self._names[key]
				1135	except KeyError:
				1136	name = key
				1137	if "}" in name:
				1138	name = "{" + name
				1139	self._names[key] = name = self._fixtext(name)
				1140	return name
				1141
				1142	def _start(self, tag, attrib_in):
				1143	fixname = self._fixname
				1144	tag = fixname(tag)
				1145	attrib = {}
				1146	for key, value in attrib_in.items():
				1147	attrib[fixname(key)] = self._fixtext(value)
				1148	return self._target.start(tag, attrib)
				1149
				1150	def _start_list(self, tag, attrib_in):
				1151	fixname = self._fixname
				1152	tag = fixname(tag)
				1153	attrib = {}
				1154	if attrib_in:
				1155	for i in range(0, len(attrib_in), 2):
				1156	attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
				1157	return self._target.start(tag, attrib)
				1158
				1159	def _data(self, text):
				1160	return self._target.data(self._fixtext(text))
				1161
				1162	def _end(self, tag):
				1163	return self._target.end(self._fixname(tag))
				1164
				1165	def _default(self, text):
				1166	prefix = text[:1]
				1167	if prefix == "&":
				1168	# deal with undefined entities
				1169	try:
				1170	self._target.data(self.entity[text[1:-1]])
				1171	except KeyError:
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	1172	from xml.parsers import expat
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1173	raise expat.error(
				1174	"undefined entity %s: line %d, column %d" %
				1175	(text, self._parser.ErrorLineNumber,
				1176	self._parser.ErrorColumnNumber)
				1177	)
				1178	elif prefix == "<" and text[:9] == "<!DOCTYPE":
				1179	self._doctype = [] # inside a doctype declaration
				1180	elif self._doctype is not None:
				1181	# parse doctype contents
				1182	if prefix == ">":
				1183	self._doctype = None
				1184	return
Neal Norwitz	9d72bb4	2007-04-17 08:48:32 +0000	[diff] [blame]	1185	text = text.strip()
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1186	if not text:
				1187	return
				1188	self._doctype.append(text)
				1189	n = len(self._doctype)
				1190	if n > 2:
				1191	type = self._doctype[1]
				1192	if type == "PUBLIC" and n == 4:
				1193	name, type, pubid, system = self._doctype
				1194	elif type == "SYSTEM" and n == 3:
				1195	name, type, system = self._doctype
				1196	pubid = None
				1197	else:
				1198	return
				1199	if pubid:
				1200	pubid = pubid[1:-1]
				1201	self.doctype(name, pubid, system[1:-1])
				1202	self._doctype = None
				1203
				1204	##
				1205	# Handles a doctype declaration.
				1206	#
				1207	# @param name Doctype name.
				1208	# @param pubid Public identifier.
				1209	# @param system System identifier.
				1210
				1211	def doctype(self, name, pubid, system):
				1212	pass
				1213
				1214	##
				1215	# Feeds data to the parser.
				1216	#
				1217	# @param data Encoded data.
				1218
				1219	def feed(self, data):
				1220	self._parser.Parse(data, 0)
				1221
				1222	##
				1223	# Finishes feeding data to the parser.
				1224	#
				1225	# @return An element structure.
				1226	# @defreturn Element
				1227
				1228	def close(self):
				1229	self._parser.Parse("", 1) # end of data
				1230	tree = self._target.close()
				1231	del self._target, self._parser # get rid of circular references
				1232	return tree
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	1233
# compatibility: XMLParser is a second name bound to the XMLTreeBuilder class
XMLParser = XMLTreeBuilder