Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython2

blob: 7dbc72e78f958971268f4eec3ddbbbced2ee0070 [file] [log] [blame]

Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1	#
				2	# ElementTree
				3	# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
				4	#
				5	# light-weight XML support for Python 1.5.2 and later.
				6	#
				7	# history:
				8	# 2001-10-20 fl created (from various sources)
				9	# 2001-11-01 fl return root from parse method
				10	# 2002-02-16 fl sort attributes in lexical order
				11	# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
				12	# 2002-05-01 fl finished TreeBuilder refactoring
				13	# 2002-07-14 fl added basic namespace support to ElementTree.write
				14	# 2002-07-25 fl added QName attribute support
				15	# 2002-10-20 fl fixed encoding in write
				16	# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
				17	# 2002-11-27 fl accept file objects or file names for parse/write
				18	# 2002-12-04 fl moved XMLTreeBuilder back to this module
				19	# 2003-01-11 fl fixed entity encoding glitch for us-ascii
				20	# 2003-02-13 fl added XML literal factory
				21	# 2003-02-21 fl added ProcessingInstruction/PI factory
				22	# 2003-05-11 fl added tostring/fromstring helpers
				23	# 2003-05-26 fl added ElementPath support
				24	# 2003-07-05 fl added makeelement factory method
				25	# 2003-07-28 fl added more well-known namespace prefixes
				26	# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
				27	# 2003-09-04 fl fall back on emulator if ElementPath is not installed
				28	# 2003-10-31 fl markup updates
				29	# 2003-11-15 fl fixed nested namespace bug
				30	# 2004-03-28 fl added XMLID helper
				31	# 2004-06-02 fl added default support to findtext
				32	# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
				33	# 2004-08-23 fl take advantage of post-2.1 expat features
				34	# 2005-02-01 fl added iterparse implementation
				35	# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
				36	#
				37	# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
				38	#
				39	# fredrik@pythonware.com
				40	# http://www.pythonware.com
				41	#
				42	# --------------------------------------------------------------------
				43	# The ElementTree toolkit is
				44	#
				45	# Copyright (c) 1999-2005 by Fredrik Lundh
				46	#
				47	# By obtaining, using, and/or copying this software and/or its
				48	# associated documentation, you agree that you have read, understood,
				49	# and will comply with the following terms and conditions:
				50	#
				51	# Permission to use, copy, modify, and distribute this software and
				52	# its associated documentation for any purpose and without fee is
				53	# hereby granted, provided that the above copyright notice appears in
				54	# all copies, and that both that copyright notice and this permission
				55	# notice appear in supporting documentation, and that the name of
				56	# Secret Labs AB or the author not be used in advertising or publicity
				57	# pertaining to distribution of the software without specific, written
				58	# prior permission.
				59	#
				60	# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
				61	# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
				62	# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
				63	# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
				64	# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
				65	# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
				66	# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
				67	# OF THIS SOFTWARE.
				68	# --------------------------------------------------------------------
				69
Fredrik Lundh	63168a5	2005-12-14 22:29:34 +0000	[diff] [blame]	70	# Licensed to PSF under a Contributor Agreement.
				71	# See http://www.python.org/2.4/license for licensing details.
				72
# Names exported by "from ElementTree import *" (the public API).
__all__ = [
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "iselement",
    "iterparse",
    "parse",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLParser",
    "XMLTreeBuilder",
]
				89
				90	##
				91	# The <b>Element</b> type is a flexible container object, designed to
				92	# store hierarchical data structures in memory. The type can be
				93	# described as a cross between a list and a dictionary.
				94	# <p>
				95	# Each element has a number of properties associated with it:
				96	# <ul>
				97	# <li>a <i>tag</i>. This is a string identifying what kind of data
				98	# this element represents (the element type, in other words).</li>
				99	# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
				100	# <li>a <i>text</i> string.</li>
				101	# <li>an optional <i>tail</i> string.</li>
				102	# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
				103	# </ul>
				104	#
				105	# To create an element instance, use the {@link #Element} or {@link
				106	# #SubElement} factory functions.
				107	# <p>
				108	# The {@link #ElementTree} class can be used to wrap an element
				109	# structure, and convert it from and to XML.
				110	##
				111
				112	import string, sys, re
				113
				114	class _SimpleElementPath:
				115	# emulate pre-1.2 find/findtext/findall behaviour
				116	def find(self, element, tag):
				117	for elem in element:
				118	if elem.tag == tag:
				119	return elem
				120	return None
				121	def findtext(self, element, tag, default=None):
				122	for elem in element:
				123	if elem.tag == tag:
				124	return elem.text or ""
				125	return default
				126	def findall(self, element, tag):
				127	if tag[:3] == ".//":
				128	return element.getiterator(tag[3:])
				129	result = []
				130	for elem in element:
				131	if elem.tag == tag:
				132	result.append(elem)
				133	return result
				134
				135	try:
				136	import ElementPath
				137	except ImportError:
				138	# FIXME: issue warning in this case?
				139	ElementPath = _SimpleElementPath()
				140
				141	# TODO: add support for custom namespace resolvers/default namespaces
				142	# TODO: add improved support for incremental parsing
				143
				144	VERSION = "1.2.6"
				145
				146	##
				147	# Internal element class. This class defines the Element interface,
				148	# and provides a reference implementation of this interface.
				149	# <p>
				150	# You should not create instances of this class directly. Use the
				151	# appropriate factory functions instead, such as {@link #Element}
				152	# and {@link #SubElement}.
				153	#
				154	# @see Element
				155	# @see SubElement
				156	# @see Comment
				157	# @see ProcessingInstruction
				158
				159	class _ElementInterface:
				160	# <tag attrib>text<child/>...</tag>tail
				161
				162	##
				163	# (Attribute) Element tag.
				164
				165	tag = None
				166
				167	##
				168	# (Attribute) Element attribute dictionary. Where possible, use
				169	# {@link #_ElementInterface.get},
				170	# {@link #_ElementInterface.set},
				171	# {@link #_ElementInterface.keys}, and
				172	# {@link #_ElementInterface.items} to access
				173	# element attributes.
				174
				175	attrib = None
				176
				177	##
				178	# (Attribute) Text before first subelement. This is either a
				179	# string or the value None, if there was no text.
				180
				181	text = None
				182
				183	##
				184	# (Attribute) Text after this element's end tag, but before the
				185	# next sibling element's start tag. This is either a string or
				186	# the value None, if there was no text.
				187
				188	tail = None # text after end tag, if any
				189
				190	def __init__(self, tag, attrib):
				191	self.tag = tag
				192	self.attrib = attrib
				193	self._children = []
				194
				195	def __repr__(self):
				196	return "<Element %s at %x>" % (self.tag, id(self))
				197
				198	##
				199	# Creates a new element object of the same type as this element.
				200	#
				201	# @param tag Element tag.
				202	# @param attrib Element attributes, given as a dictionary.
				203	# @return A new element instance.
				204
				205	def makeelement(self, tag, attrib):
				206	return Element(tag, attrib)
				207
				208	##
				209	# Returns the number of subelements.
				210	#
				211	# @return The number of subelements.
				212
				213	def __len__(self):
				214	return len(self._children)
				215
				216	##
				217	# Returns the given subelement.
				218	#
				219	# @param index What subelement to return.
				220	# @return The given subelement.
				221	# @exception IndexError If the given element does not exist.
				222
				223	def __getitem__(self, index):
				224	return self._children[index]
				225
				226	##
				227	# Replaces the given subelement.
				228	#
				229	# @param index What subelement to replace.
				230	# @param element The new element value.
				231	# @exception IndexError If the given element does not exist.
				232	# @exception AssertionError If element is not a valid object.
				233
				234	def __setitem__(self, index, element):
				235	assert iselement(element)
				236	self._children[index] = element
				237
				238	##
				239	# Deletes the given subelement.
				240	#
				241	# @param index What subelement to delete.
				242	# @exception IndexError If the given element does not exist.
				243
				244	def __delitem__(self, index):
				245	del self._children[index]
				246
				247	##
				248	# Returns a list containing subelements in the given range.
				249	#
				250	# @param start The first subelement to return.
				251	# @param stop The first subelement that shouldn't be returned.
				252	# @return A sequence object containing subelements.
				253
				254	def __getslice__(self, start, stop):
				255	return self._children[start:stop]
				256
				257	##
				258	# Replaces a number of subelements with elements from a sequence.
				259	#
				260	# @param start The first subelement to replace.
				261	# @param stop The first subelement that shouldn't be replaced.
				262	# @param elements A sequence object with zero or more elements.
				263	# @exception AssertionError If a sequence member is not a valid object.
				264
				265	def __setslice__(self, start, stop, elements):
				266	for element in elements:
				267	assert iselement(element)
				268	self._children[start:stop] = list(elements)
				269
				270	##
				271	# Deletes a number of subelements.
				272	#
				273	# @param start The first subelement to delete.
				274	# @param stop The first subelement to leave in there.
				275
				276	def __delslice__(self, start, stop):
				277	del self._children[start:stop]
				278
				279	##
				280	# Adds a subelement to the end of this element.
				281	#
				282	# @param element The element to add.
				283	# @exception AssertionError If a sequence member is not a valid object.
				284
				285	def append(self, element):
				286	assert iselement(element)
				287	self._children.append(element)
				288
				289	##
				290	# Inserts a subelement at the given position in this element.
				291	#
				292	# @param index Where to insert the new subelement.
				293	# @exception AssertionError If the element is not a valid object.
				294
				295	def insert(self, index, element):
				296	assert iselement(element)
				297	self._children.insert(index, element)
				298
				299	##
				300	# Removes a matching subelement. Unlike the <b>find</b> methods,
				301	# this method compares elements based on identity, not on tag
				302	# value or contents.
				303	#
				304	# @param element What element to remove.
				305	# @exception ValueError If a matching element could not be found.
				306	# @exception AssertionError If the element is not a valid object.
				307
				308	def remove(self, element):
				309	assert iselement(element)
				310	self._children.remove(element)
				311
				312	##
				313	# Returns all subelements. The elements are returned in document
				314	# order.
				315	#
				316	# @return A list of subelements.
				317	# @defreturn list of Element instances
				318
				319	def getchildren(self):
				320	return self._children
				321
				322	##
				323	# Finds the first matching subelement, by tag name or path.
				324	#
				325	# @param path What element to look for.
				326	# @return The first matching element, or None if no element was found.
				327	# @defreturn Element or None
				328
				329	def find(self, path):
				330	return ElementPath.find(self, path)
				331
				332	##
				333	# Finds text for the first matching subelement, by tag name or path.
				334	#
				335	# @param path What element to look for.
				336	# @param default What to return if the element was not found.
				337	# @return The text content of the first matching element, or the
				338	# default value no element was found. Note that if the element
				339	# has is found, but has no text content, this method returns an
				340	# empty string.
				341	# @defreturn string
				342
				343	def findtext(self, path, default=None):
				344	return ElementPath.findtext(self, path, default)
				345
				346	##
				347	# Finds all matching subelements, by tag name or path.
				348	#
				349	# @param path What element to look for.
				350	# @return A list or iterator containing all matching elements,
				351	# in document order.
				352	# @defreturn list of Element instances
				353
				354	def findall(self, path):
				355	return ElementPath.findall(self, path)
				356
				357	##
				358	# Resets an element. This function removes all subelements, clears
				359	# all attributes, and sets the text and tail attributes to None.
				360
				361	def clear(self):
				362	self.attrib.clear()
				363	self._children = []
				364	self.text = self.tail = None
				365
				366	##
				367	# Gets an element attribute.
				368	#
				369	# @param key What attribute to look for.
				370	# @param default What to return if the attribute was not found.
				371	# @return The attribute value, or the default value, if the
				372	# attribute was not found.
				373	# @defreturn string or None
				374
				375	def get(self, key, default=None):
				376	return self.attrib.get(key, default)
				377
				378	##
				379	# Sets an element attribute.
				380	#
				381	# @param key What attribute to set.
				382	# @param value The attribute value.
				383
				384	def set(self, key, value):
				385	self.attrib[key] = value
				386
				387	##
				388	# Gets a list of attribute names. The names are returned in an
				389	# arbitrary order (just like for an ordinary Python dictionary).
				390	#
				391	# @return A list of element attribute names.
				392	# @defreturn list of strings
				393
				394	def keys(self):
				395	return self.attrib.keys()
				396
				397	##
				398	# Gets element attributes, as a sequence. The attributes are
				399	# returned in an arbitrary order.
				400	#
				401	# @return A list of (name, value) tuples for all attributes.
				402	# @defreturn list of (string, string) tuples
				403
				404	def items(self):
				405	return self.attrib.items()
				406
				407	##
				408	# Creates a tree iterator. The iterator loops over this element
				409	# and all subelements, in document order, and returns all elements
				410	# with a matching tag.
				411	# <p>
				412	# If the tree structure is modified during iteration, the result
				413	# is undefined.
				414	#
				415	# @param tag What tags to look for (default is to return all elements).
				416	# @return A list or iterator containing all the matching elements.
				417	# @defreturn list or iterator
				418
				419	def getiterator(self, tag=None):
				420	nodes = []
				421	if tag == "*":
				422	tag = None
				423	if tag is None or self.tag == tag:
				424	nodes.append(self)
				425	for node in self._children:
				426	nodes.extend(node.getiterator(tag))
				427	return nodes
				428
				429	# compatibility
				430	_Element = _ElementInterface
				431
				432	##
				433	# Element factory. This function returns an object implementing the
				434	# standard Element interface. The exact class or type of that object
				435	# is implementation dependent, but it will always be compatible with
				436	# the {@link #_ElementInterface} class in this module.
				437	# <p>
				438	# The element name, attribute names, and attribute values can be
				439	# either 8-bit ASCII strings or Unicode strings.
				440	#
				441	# @param tag The element name.
				442	# @param attrib An optional dictionary, containing element attributes.
				443	# @param **extra Additional attributes, given as keyword arguments.
				444	# @return An element instance.
				445	# @defreturn Element
				446
				447	def Element(tag, attrib={}, **extra):
				448	attrib = attrib.copy()
				449	attrib.update(extra)
				450	return _ElementInterface(tag, attrib)
				451
				452	##
				453	# Subelement factory. This function creates an element instance, and
				454	# appends it to an existing element.
				455	# <p>
				456	# The element name, attribute names, and attribute values can be
				457	# either 8-bit ASCII strings or Unicode strings.
				458	#
				459	# @param parent The parent element.
				460	# @param tag The subelement name.
				461	# @param attrib An optional dictionary, containing element attributes.
				462	# @param **extra Additional attributes, given as keyword arguments.
				463	# @return An element instance.
				464	# @defreturn Element
				465
				466	def SubElement(parent, tag, attrib={}, **extra):
				467	attrib = attrib.copy()
				468	attrib.update(extra)
				469	element = parent.makeelement(tag, attrib)
				470	parent.append(element)
				471	return element
				472
				473	##
				474	# Comment element factory. This factory function creates a special
				475	# element that will be serialized as an XML comment.
				476	# <p>
				477	# The comment string can be either an 8-bit ASCII string or a Unicode
				478	# string.
				479	#
				480	# @param text A string containing the comment string.
				481	# @return An element instance, representing a comment.
				482	# @defreturn Element
				483
				484	def Comment(text=None):
				485	element = Element(Comment)
				486	element.text = text
				487	return element
				488
				489	##
				490	# PI element factory. This factory function creates a special element
				491	# that will be serialized as an XML processing instruction.
				492	#
				493	# @param target A string containing the PI target.
				494	# @param text A string containing the PI contents, if any.
				495	# @return An element instance, representing a PI.
				496	# @defreturn Element
				497
				498	def ProcessingInstruction(target, text=None):
				499	element = Element(ProcessingInstruction)
				500	element.text = target
				501	if text:
				502	element.text = element.text + " " + text
				503	return element
				504
				505	PI = ProcessingInstruction
				506
				507	##
				508	# QName wrapper. This can be used to wrap a QName attribute value, in
				509	# order to get proper namespace handling on output.
				510	#
				511	# @param text A string containing the QName value, in the form {uri}local,
				512	# or, if the tag argument is given, the URI part of a QName.
				513	# @param tag Optional tag. If given, the first argument is interpreted as
				514	# a URI, and this argument is interpreted as a local name.
				515	# @return An opaque object, representing the QName.
				516
				517	class QName:
				518	def __init__(self, text_or_uri, tag=None):
				519	if tag:
				520	text_or_uri = "{%s}%s" % (text_or_uri, tag)
				521	self.text = text_or_uri
				522	def __str__(self):
				523	return self.text
				524	def __hash__(self):
				525	return hash(self.text)
				526	def __cmp__(self, other):
				527	if isinstance(other, QName):
				528	return cmp(self.text, other.text)
				529	return cmp(self.text, other)
				530
				531	##
				532	# ElementTree wrapper class. This class represents an entire element
				533	# hierarchy, and adds some extra support for serialization to and from
				534	# standard XML.
				535	#
				536	# @param element Optional root element.
				537	# @keyparam file Optional file handle or name. If given, the
				538	# tree is initialized with the contents of this XML file.
				539
				540	class ElementTree:
				541
				542	def __init__(self, element=None, file=None):
				543	assert element is None or iselement(element)
				544	self._root = element # first node
				545	if file:
				546	self.parse(file)
				547
				548	##
				549	# Gets the root element for this tree.
				550	#
				551	# @return An element instance.
				552	# @defreturn Element
				553
				554	def getroot(self):
				555	return self._root
				556
				557	##
				558	# Replaces the root element for this tree. This discards the
				559	# current contents of the tree, and replaces it with the given
				560	# element. Use with care.
				561	#
				562	# @param element An element instance.
				563
				564	def _setroot(self, element):
				565	assert iselement(element)
				566	self._root = element
				567
				568	##
				569	# Loads an external XML document into this element tree.
				570	#
				571	# @param source A file name or file object.
				572	# @param parser An optional parser instance. If not given, the
				573	# standard {@link XMLTreeBuilder} parser is used.
				574	# @return The document root element.
				575	# @defreturn Element
				576
				577	def parse(self, source, parser=None):
				578	if not hasattr(source, "read"):
				579	source = open(source, "rb")
				580	if not parser:
				581	parser = XMLTreeBuilder()
				582	while 1:
				583	data = source.read(32768)
				584	if not data:
				585	break
				586	parser.feed(data)
				587	self._root = parser.close()
				588	return self._root
				589
				590	##
				591	# Creates a tree iterator for the root element. The iterator loops
				592	# over all elements in this tree, in document order.
				593	#
				594	# @param tag What tags to look for (default is to return all elements)
				595	# @return An iterator.
				596	# @defreturn iterator
				597
				598	def getiterator(self, tag=None):
				599	assert self._root is not None
				600	return self._root.getiterator(tag)
				601
				602	##
				603	# Finds the first toplevel element with given tag.
				604	# Same as getroot().find(path).
				605	#
				606	# @param path What element to look for.
				607	# @return The first matching element, or None if no element was found.
				608	# @defreturn Element or None
				609
				610	def find(self, path):
				611	assert self._root is not None
				612	if path[:1] == "/":
				613	path = "." + path
				614	return self._root.find(path)
				615
				616	##
				617	# Finds the element text for the first toplevel element with given
				618	# tag. Same as getroot().findtext(path).
				619	#
				620	# @param path What toplevel element to look for.
				621	# @param default What to return if the element was not found.
				622	# @return The text content of the first matching element, or the
				623	# default value no element was found. Note that if the element
				624	# has is found, but has no text content, this method returns an
				625	# empty string.
				626	# @defreturn string
				627
				628	def findtext(self, path, default=None):
				629	assert self._root is not None
				630	if path[:1] == "/":
				631	path = "." + path
				632	return self._root.findtext(path, default)
				633
				634	##
				635	# Finds all toplevel elements with the given tag.
				636	# Same as getroot().findall(path).
				637	#
				638	# @param path What element to look for.
				639	# @return A list or iterator containing all matching elements,
				640	# in document order.
				641	# @defreturn list of Element instances
				642
				643	def findall(self, path):
				644	assert self._root is not None
				645	if path[:1] == "/":
				646	path = "." + path
				647	return self._root.findall(path)
				648
				649	##
				650	# Writes the element tree to a file, as XML.
				651	#
				652	# @param file A file name, or a file object opened for writing.
				653	# @param encoding Optional output encoding (default is US-ASCII).
				654
				655	def write(self, file, encoding="us-ascii"):
				656	assert self._root is not None
				657	if not hasattr(file, "write"):
				658	file = open(file, "wb")
				659	if not encoding:
				660	encoding = "us-ascii"
				661	elif encoding != "utf-8" and encoding != "us-ascii":
				662	file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
				663	self._write(file, self._root, encoding, {})
				664
				665	def _write(self, file, node, encoding, namespaces):
				666	# write XML to file
				667	tag = node.tag
				668	if tag is Comment:
				669	file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
				670	elif tag is ProcessingInstruction:
				671	file.write("<?%s?>" % _escape_cdata(node.text, encoding))
				672	else:
				673	items = node.items()
				674	xmlns_items = [] # new namespaces in this scope
				675	try:
				676	if isinstance(tag, QName) or tag[:1] == "{":
				677	tag, xmlns = fixtag(tag, namespaces)
				678	if xmlns: xmlns_items.append(xmlns)
				679	except TypeError:
				680	_raise_serialization_error(tag)
				681	file.write("<" + _encode(tag, encoding))
				682	if items or xmlns_items:
				683	items.sort() # lexical order
				684	for k, v in items:
				685	try:
				686	if isinstance(k, QName) or k[:1] == "{":
				687	k, xmlns = fixtag(k, namespaces)
				688	if xmlns: xmlns_items.append(xmlns)
				689	except TypeError:
				690	_raise_serialization_error(k)
				691	try:
				692	if isinstance(v, QName):
				693	v, xmlns = fixtag(v, namespaces)
				694	if xmlns: xmlns_items.append(xmlns)
				695	except TypeError:
				696	_raise_serialization_error(v)
				697	file.write(" %s=\"%s\"" % (_encode(k, encoding),
				698	_escape_attrib(v, encoding)))
				699	for k, v in xmlns_items:
				700	file.write(" %s=\"%s\"" % (_encode(k, encoding),
				701	_escape_attrib(v, encoding)))
				702	if node.text or len(node):
				703	file.write(">")
				704	if node.text:
				705	file.write(_escape_cdata(node.text, encoding))
				706	for n in node:
				707	self._write(file, n, encoding, namespaces)
				708	file.write("</" + _encode(tag, encoding) + ">")
				709	else:
				710	file.write(" />")
				711	for k, v in xmlns_items:
				712	del namespaces[v]
				713	if node.tail:
				714	file.write(_escape_cdata(node.tail, encoding))
				715
				716	# --------------------------------------------------------------------
				717	# helpers
				718
				719	##
				720	# Checks if an object appears to be a valid element object.
				721	#
				722	# @param element An element instance.
				723	# @return A true value if this is an element object.
				724	# @defreturn flag
				725
				726	def iselement(element):
				727	# FIXME: not sure about this; might be a better idea to look
				728	# for tag/attrib/text attributes
				729	return isinstance(element, _ElementInterface) or hasattr(element, "tag")
				730
				731	##
				732	# Writes an element tree or element structure to sys.stdout. This
				733	# function should be used for debugging only.
				734	# <p>
				735	# The exact output format is implementation dependent. In this
				736	# version, it's written as an ordinary XML file.
				737	#
				738	# @param elem An element tree or an individual element.
				739
				740	def dump(elem):
				741	# debugging
				742	if not isinstance(elem, ElementTree):
				743	elem = ElementTree(elem)
				744	elem.write(sys.stdout)
				745	tail = elem.getroot().tail
				746	if not tail or tail[-1] != "\n":
				747	sys.stdout.write("\n")
				748
				749	def _encode(s, encoding):
				750	try:
				751	return s.encode(encoding)
				752	except AttributeError:
				753	return s # 1.5.2: assume the string uses the right encoding
				754
				755	if sys.version[:3] == "1.5":
				756	_escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
				757	else:
				758	_escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
				759
				760	_escape_map = {
				761	"&": "&",
				762	"<": "<",
				763	">": ">",
				764	'"': """,
				765	}
				766
				767	_namespace_map = {
				768	# "well-known" namespace prefixes
				769	"http://www.w3.org/XML/1998/namespace": "xml",
				770	"http://www.w3.org/1999/xhtml": "html",
				771	"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
				772	"http://schemas.xmlsoap.org/wsdl/": "wsdl",
				773	}
				774
				775	def _raise_serialization_error(text):
				776	raise TypeError(
				777	"cannot serialize %r (type %s)" % (text, type(text).__name__)
				778	)
				779
				780	def _encode_entity(text, pattern=_escape):
				781	# map reserved and non-ascii characters to numerical entities
				782	def escape_entities(m, map=_escape_map):
				783	out = []
				784	append = out.append
				785	for char in m.group():
				786	text = map.get(char)
				787	if text is None:
				788	text = "&#%d;" % ord(char)
				789	append(text)
				790	return string.join(out, "")
				791	try:
				792	return _encode(pattern.sub(escape_entities, text), "ascii")
				793	except TypeError:
				794	_raise_serialization_error(text)
				795
				796	#
				797	# the following functions assume an ascii-compatible encoding
				798	# (or "utf-16")
				799
				800	def _escape_cdata(text, encoding=None, replace=string.replace):
				801	# escape character data
				802	try:
				803	if encoding:
				804	try:
				805	text = _encode(text, encoding)
				806	except UnicodeError:
				807	return _encode_entity(text)
				808	text = replace(text, "&", "&")
				809	text = replace(text, "<", "<")
				810	text = replace(text, ">", ">")
				811	return text
				812	except (TypeError, AttributeError):
				813	_raise_serialization_error(text)
				814
				815	def _escape_attrib(text, encoding=None, replace=string.replace):
				816	# escape attribute value
				817	try:
				818	if encoding:
				819	try:
				820	text = _encode(text, encoding)
				821	except UnicodeError:
				822	return _encode_entity(text)
				823	text = replace(text, "&", "&")
				824	text = replace(text, "'", "'") # FIXME: overkill
				825	text = replace(text, "\"", """)
				826	text = replace(text, "<", "<")
				827	text = replace(text, ">", ">")
				828	return text
				829	except (TypeError, AttributeError):
				830	_raise_serialization_error(text)
				831
				832	def fixtag(tag, namespaces):
				833	# given a decorated tag (of the form {uri}tag), return prefixed
				834	# tag and namespace declaration, if any
				835	if isinstance(tag, QName):
				836	tag = tag.text
				837	namespace_uri, tag = string.split(tag[1:], "}", 1)
				838	prefix = namespaces.get(namespace_uri)
				839	if prefix is None:
				840	prefix = _namespace_map.get(namespace_uri)
				841	if prefix is None:
				842	prefix = "ns%d" % len(namespaces)
				843	namespaces[namespace_uri] = prefix
				844	if prefix == "xml":
				845	xmlns = None
				846	else:
				847	xmlns = ("xmlns:%s" % prefix, namespace_uri)
				848	else:
				849	xmlns = None
				850	return "%s:%s" % (prefix, tag), xmlns
				851
				852	##
				853	# Parses an XML document into an element tree.
				854	#
				855	# @param source A filename or file object containing XML data.
				856	# @param parser An optional parser instance. If not given, the
				857	# standard {@link XMLTreeBuilder} parser is used.
				858	# @return An ElementTree instance
				859
				860	def parse(source, parser=None):
				861	tree = ElementTree()
				862	tree.parse(source, parser)
				863	return tree
				864
				865	##
				866	# Parses an XML document into an element tree incrementally, and reports
				867	# what's going on to the user.
				868	#
				869	# @param source A filename or file object containing XML data.
				870	# @param events A list of events to report back. If omitted, only "end"
				871	# events are reported.
				872	# @return A (event, elem) iterator.
				873
				874	class iterparse:
				875
				876	def __init__(self, source, events=None):
				877	if not hasattr(source, "read"):
				878	source = open(source, "rb")
				879	self._file = source
				880	self._events = []
				881	self._index = 0
				882	self.root = self._root = None
				883	self._parser = XMLTreeBuilder()
				884	# wire up the parser for event reporting
				885	parser = self._parser._parser
				886	append = self._events.append
				887	if events is None:
				888	events = ["end"]
				889	for event in events:
				890	if event == "start":
				891	try:
				892	parser.ordered_attributes = 1
				893	parser.specified_attributes = 1
				894	def handler(tag, attrib_in, event=event, append=append,
				895	start=self._parser._start_list):
				896	append((event, start(tag, attrib_in)))
				897	parser.StartElementHandler = handler
				898	except AttributeError:
				899	def handler(tag, attrib_in, event=event, append=append,
				900	start=self._parser._start):
				901	append((event, start(tag, attrib_in)))
				902	parser.StartElementHandler = handler
				903	elif event == "end":
				904	def handler(tag, event=event, append=append,
				905	end=self._parser._end):
				906	append((event, end(tag)))
				907	parser.EndElementHandler = handler
				908	elif event == "start-ns":
				909	def handler(prefix, uri, event=event, append=append):
				910	try:
				911	uri = _encode(uri, "ascii")
				912	except UnicodeError:
				913	pass
				914	append((event, (prefix or "", uri)))
				915	parser.StartNamespaceDeclHandler = handler
				916	elif event == "end-ns":
				917	def handler(prefix, event=event, append=append):
				918	append((event, None))
				919	parser.EndNamespaceDeclHandler = handler
				920
				921	def next(self):
				922	while 1:
				923	try:
				924	item = self._events[self._index]
				925	except IndexError:
				926	if self._parser is None:
				927	self.root = self._root
				928	try:
				929	raise StopIteration
				930	except NameError:
				931	raise IndexError
				932	# load event buffer
				933	del self._events[:]
				934	self._index = 0
				935	data = self._file.read(16384)
				936	if data:
				937	self._parser.feed(data)
				938	else:
				939	self._root = self._parser.close()
				940	self._parser = None
				941	else:
				942	self._index = self._index + 1
				943	return item
				944
				945	try:
				946	iter
				947	def __iter__(self):
				948	return self
				949	except NameError:
				950	def __getitem__(self, index):
				951	return self.next()
				952
				953	##
				954	# Parses an XML document from a string constant. This function can
				955	# be used to embed "XML literals" in Python code.
				956	#
				957	# @param text A string containing XML data.
				958	# @return An Element instance.
				959	# @defreturn Element
				960
				961	def XML(text):
				962	parser = XMLTreeBuilder()
				963	parser.feed(text)
				964	return parser.close()
				965
				966	##
				967	# Parses an XML document from a string constant, and also returns
				968	# a dictionary which maps from element id:s to elements.
				969	#
				970	# @param text A string containing XML data.
				971	# @return A tuple containing an Element instance and a dictionary.
				972	# @defreturn (Element, dictionary)
				973
				974	def XMLID(text):
				975	parser = XMLTreeBuilder()
				976	parser.feed(text)
				977	tree = parser.close()
				978	ids = {}
				979	for elem in tree.getiterator():
				980	id = elem.get("id")
				981	if id:
				982	ids[id] = elem
				983	return tree, ids
				984
				985	##
				986	# Parses an XML document from a string constant. Same as {@link #XML}.
				987	#
				988	# @def fromstring(text)
				989	# @param text A string containing XML data.
				990	# @return An Element instance.
				991	# @defreturn Element
				992
# alias; fromstring refers to the same function object as XML
fromstring = XML
				994
				995	##
				996	# Generates a string representation of an XML element, including all
				997	# subelements.
				998	#
				999	# @param element An Element instance.
				1000	# @return An encoded string containing the XML data.
				1001	# @defreturn string
				1002
				1003	def tostring(element, encoding=None):
				1004	class dummy:
				1005	pass
				1006	data = []
				1007	file = dummy()
				1008	file.write = data.append
				1009	ElementTree(element).write(file, encoding)
				1010	return string.join(data, "")
				1011
				1012	##
				1013	# Generic element structure builder. This builder converts a sequence
				1014	# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
				1015	# #TreeBuilder.end} method calls to a well-formed element structure.
				1016	# <p>
				1017	# You can use this class to build an element structure using a custom XML
				1018	# parser, or a parser for some other XML-like format.
				1019	#
				1020	# @param element_factory Optional element factory. This factory
				1021	# is called to create new Element instances, as necessary.
				1022
				1023	class TreeBuilder:
				1024
				1025	def __init__(self, element_factory=None):
				1026	self._data = [] # data collector
				1027	self._elem = [] # element stack
				1028	self._last = None # last element
				1029	self._tail = None # true if we're after an end tag
				1030	if element_factory is None:
				1031	element_factory = _ElementInterface
				1032	self._factory = element_factory
				1033
				1034	##
				1035	# Flushes the parser buffers, and returns the toplevel documen
				1036	# element.
				1037	#
				1038	# @return An Element instance.
				1039	# @defreturn Element
				1040
				1041	def close(self):
				1042	assert len(self._elem) == 0, "missing end tags"
				1043	assert self._last != None, "missing toplevel element"
				1044	return self._last
				1045
				1046	def _flush(self):
				1047	if self._data:
				1048	if self._last is not None:
				1049	text = string.join(self._data, "")
				1050	if self._tail:
				1051	assert self._last.tail is None, "internal error (tail)"
				1052	self._last.tail = text
				1053	else:
				1054	assert self._last.text is None, "internal error (text)"
				1055	self._last.text = text
				1056	self._data = []
				1057
				1058	##
				1059	# Adds text to the current element.
				1060	#
				1061	# @param data A string. This should be either an 8-bit string
				1062	# containing ASCII text, or a Unicode string.
				1063
				1064	def data(self, data):
				1065	self._data.append(data)
				1066
				1067	##
				1068	# Opens a new element.
				1069	#
				1070	# @param tag The element name.
				1071	# @param attrib A dictionary containing element attributes.
				1072	# @return The opened element.
				1073	# @defreturn Element
				1074
				1075	def start(self, tag, attrs):
				1076	self._flush()
				1077	self._last = elem = self._factory(tag, attrs)
				1078	if self._elem:
				1079	self._elem[-1].append(elem)
				1080	self._elem.append(elem)
				1081	self._tail = 0
				1082	return elem
				1083
				1084	##
				1085	# Closes the current element.
				1086	#
				1087	# @param tag The element name.
				1088	# @return The closed element.
				1089	# @defreturn Element
				1090
				1091	def end(self, tag):
				1092	self._flush()
				1093	self._last = self._elem.pop()
				1094	assert self._last.tag == tag,\
				1095	"end tag mismatch (expected %s, got %s)" % (
				1096	self._last.tag, tag)
				1097	self._tail = 1
				1098	return self._last
				1099
				1100	##
				1101	# Element structure builder for XML source data, based on the
				1102	# <b>expat</b> parser.
				1103	#
				1104	# @keyparam target Target object. If omitted, the builder uses an
				1105	# instance of the standard {@link #TreeBuilder} class.
				1106	# @keyparam html Predefine HTML entities. This flag is not supported
				1107	# by the current implementation.
				1108	# @see #ElementTree
				1109	# @see #TreeBuilder
				1110
				1111	class XMLTreeBuilder:
				1112
				1113	def __init__(self, html=0, target=None):
				1114	try:
Fred Drake	fbdeaad	2006-07-29 16:56:15 +0000	[diff] [blame]	1115	from xml.parsers import expat
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1116	except ImportError:
				1117	raise ImportError(
				1118	"No module named expat; use SimpleXMLTreeBuilder instead"
				1119	)
				1120	self._parser = parser = expat.ParserCreate(None, "}")
				1121	if target is None:
				1122	target = TreeBuilder()
				1123	self._target = target
				1124	self._names = {} # name memo cache
				1125	# callbacks
				1126	parser.DefaultHandlerExpand = self._default
				1127	parser.StartElementHandler = self._start
				1128	parser.EndElementHandler = self._end
				1129	parser.CharacterDataHandler = self._data
				1130	# let expat do the buffering, if supported
				1131	try:
				1132	self._parser.buffer_text = 1
				1133	except AttributeError:
				1134	pass
				1135	# use new-style attribute handling, if supported
				1136	try:
				1137	self._parser.ordered_attributes = 1
				1138	self._parser.specified_attributes = 1
				1139	parser.StartElementHandler = self._start_list
				1140	except AttributeError:
				1141	pass
				1142	encoding = None
				1143	if not parser.returns_unicode:
				1144	encoding = "utf-8"
				1145	# target.xml(encoding, None)
				1146	self._doctype = None
				1147	self.entity = {}
				1148
				1149	def _fixtext(self, text):
				1150	# convert text string to ascii, if possible
				1151	try:
				1152	return _encode(text, "ascii")
				1153	except UnicodeError:
				1154	return text
				1155
				1156	def _fixname(self, key):
				1157	# expand qname, and convert name string to ascii, if possible
				1158	try:
				1159	name = self._names[key]
				1160	except KeyError:
				1161	name = key
				1162	if "}" in name:
				1163	name = "{" + name
				1164	self._names[key] = name = self._fixtext(name)
				1165	return name
				1166
				1167	def _start(self, tag, attrib_in):
				1168	fixname = self._fixname
				1169	tag = fixname(tag)
				1170	attrib = {}
				1171	for key, value in attrib_in.items():
				1172	attrib[fixname(key)] = self._fixtext(value)
				1173	return self._target.start(tag, attrib)
				1174
				1175	def _start_list(self, tag, attrib_in):
				1176	fixname = self._fixname
				1177	tag = fixname(tag)
				1178	attrib = {}
				1179	if attrib_in:
				1180	for i in range(0, len(attrib_in), 2):
				1181	attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
				1182	return self._target.start(tag, attrib)
				1183
				1184	def _data(self, text):
				1185	return self._target.data(self._fixtext(text))
				1186
				1187	def _end(self, tag):
				1188	return self._target.end(self._fixname(tag))
				1189
				1190	def _default(self, text):
				1191	prefix = text[:1]
				1192	if prefix == "&":
				1193	# deal with undefined entities
				1194	try:
				1195	self._target.data(self.entity[text[1:-1]])
				1196	except KeyError:
Fred Drake	fbdeaad	2006-07-29 16:56:15 +0000	[diff] [blame]	1197	from xml.parsers import expat
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1198	raise expat.error(
				1199	"undefined entity %s: line %d, column %d" %
				1200	(text, self._parser.ErrorLineNumber,
				1201	self._parser.ErrorColumnNumber)
				1202	)
				1203	elif prefix == "<" and text[:9] == "<!DOCTYPE":
				1204	self._doctype = [] # inside a doctype declaration
				1205	elif self._doctype is not None:
				1206	# parse doctype contents
				1207	if prefix == ">":
				1208	self._doctype = None
				1209	return
				1210	text = string.strip(text)
				1211	if not text:
				1212	return
				1213	self._doctype.append(text)
				1214	n = len(self._doctype)
				1215	if n > 2:
				1216	type = self._doctype[1]
				1217	if type == "PUBLIC" and n == 4:
				1218	name, type, pubid, system = self._doctype
				1219	elif type == "SYSTEM" and n == 3:
				1220	name, type, system = self._doctype
				1221	pubid = None
				1222	else:
				1223	return
				1224	if pubid:
				1225	pubid = pubid[1:-1]
				1226	self.doctype(name, pubid, system[1:-1])
				1227	self._doctype = None
				1228
				1229	##
				1230	# Handles a doctype declaration.
				1231	#
				1232	# @param name Doctype name.
				1233	# @param pubid Public identifier.
				1234	# @param system System identifier.
				1235
				1236	def doctype(self, name, pubid, system):
				1237	pass
				1238
				1239	##
				1240	# Feeds data to the parser.
				1241	#
				1242	# @param data Encoded data.
				1243
				1244	def feed(self, data):
				1245	self._parser.Parse(data, 0)
				1246
				1247	##
				1248	# Finishes feeding data to the parser.
				1249	#
				1250	# @return An element structure.
				1251	# @defreturn Element
				1252
				1253	def close(self):
				1254	self._parser.Parse("", 1) # end of data
				1255	tree = self._target.close()
				1256	del self._target, self._parser # get rid of circular references
				1257	return tree
Fredrik Lundh	bf84e54	2006-07-06 12:29:24 +0000	[diff] [blame]	1258
# compatibility; XMLParser is an alias for the XMLTreeBuilder class
XMLParser = XMLTreeBuilder