Blame - Lib/xml/etree/ElementTree.py - platform/external/python/cpython3

blob: deaed7d27be1609790b660436ce3fbcc5a137397 [file] [log] [blame]

Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1	#
				2	# ElementTree
				3	# $Id: ElementTree.py 2326 2005-03-17 07:45:21Z fredrik $
				4	#
				5	# light-weight XML support for Python 1.5.2 and later.
				6	#
				7	# history:
				8	# 2001-10-20 fl created (from various sources)
				9	# 2001-11-01 fl return root from parse method
				10	# 2002-02-16 fl sort attributes in lexical order
				11	# 2002-04-06 fl TreeBuilder refactoring, added PythonDoc markup
				12	# 2002-05-01 fl finished TreeBuilder refactoring
				13	# 2002-07-14 fl added basic namespace support to ElementTree.write
				14	# 2002-07-25 fl added QName attribute support
				15	# 2002-10-20 fl fixed encoding in write
				16	# 2002-11-24 fl changed default encoding to ascii; fixed attribute encoding
				17	# 2002-11-27 fl accept file objects or file names for parse/write
				18	# 2002-12-04 fl moved XMLTreeBuilder back to this module
				19	# 2003-01-11 fl fixed entity encoding glitch for us-ascii
				20	# 2003-02-13 fl added XML literal factory
				21	# 2003-02-21 fl added ProcessingInstruction/PI factory
				22	# 2003-05-11 fl added tostring/fromstring helpers
				23	# 2003-05-26 fl added ElementPath support
				24	# 2003-07-05 fl added makeelement factory method
				25	# 2003-07-28 fl added more well-known namespace prefixes
				26	# 2003-08-15 fl fixed typo in ElementTree.findtext (Thomas Dartsch)
				27	# 2003-09-04 fl fall back on emulator if ElementPath is not installed
				28	# 2003-10-31 fl markup updates
				29	# 2003-11-15 fl fixed nested namespace bug
				30	# 2004-03-28 fl added XMLID helper
				31	# 2004-06-02 fl added default support to findtext
				32	# 2004-06-08 fl fixed encoding of non-ascii element/attribute names
				33	# 2004-08-23 fl take advantage of post-2.1 expat features
				34	# 2005-02-01 fl added iterparse implementation
				35	# 2005-03-02 fl fixed iterparse support for pre-2.2 versions
				36	#
				37	# Copyright (c) 1999-2005 by Fredrik Lundh. All rights reserved.
				38	#
				39	# fredrik@pythonware.com
				40	# http://www.pythonware.com
				41	#
				42	# --------------------------------------------------------------------
				43	# The ElementTree toolkit is
				44	#
				45	# Copyright (c) 1999-2005 by Fredrik Lundh
				46	#
				47	# By obtaining, using, and/or copying this software and/or its
				48	# associated documentation, you agree that you have read, understood,
				49	# and will comply with the following terms and conditions:
				50	#
				51	# Permission to use, copy, modify, and distribute this software and
				52	# its associated documentation for any purpose and without fee is
				53	# hereby granted, provided that the above copyright notice appears in
				54	# all copies, and that both that copyright notice and this permission
				55	# notice appear in supporting documentation, and that the name of
				56	# Secret Labs AB or the author not be used in advertising or publicity
				57	# pertaining to distribution of the software without specific, written
				58	# prior permission.
				59	#
				60	# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
				61	# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
				62	# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
				63	# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
				64	# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
				65	# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
				66	# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
				67	# OF THIS SOFTWARE.
				68	# --------------------------------------------------------------------
				69
Fredrik Lundh	63168a5	2005-12-14 22:29:34 +0000	[diff] [blame]	70	# Licensed to PSF under a Contributor Agreement.
				71	# See http://www.python.org/2.4/license for licensing details.
				72
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	73	__all__ = [
				74	# public symbols
				75	"Comment",
				76	"dump",
				77	"Element", "ElementTree",
				78	"fromstring",
				79	"iselement", "iterparse",
				80	"parse",
				81	"PI", "ProcessingInstruction",
				82	"QName",
				83	"SubElement",
				84	"tostring",
				85	"TreeBuilder",
				86	"VERSION", "XML",
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	87	"XMLParser", "XMLTreeBuilder",
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	88	]
				89
				90	##
				91	# The <b>Element</b> type is a flexible container object, designed to
				92	# store hierarchical data structures in memory. The type can be
				93	# described as a cross between a list and a dictionary.
				94	# <p>
				95	# Each element has a number of properties associated with it:
				96	# <ul>
				97	# <li>a <i>tag</i>. This is a string identifying what kind of data
				98	# this element represents (the element type, in other words).</li>
				99	# <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
				100	# <li>a <i>text</i> string.</li>
				101	# <li>an optional <i>tail</i> string.</li>
				102	# <li>a number of <i>child elements</i>, stored in a Python sequence</li>
				103	# </ul>
				104	#
				105	# To create an element instance, use the {@link #Element} or {@link
				106	# #SubElement} factory functions.
				107	# <p>
				108	# The {@link #ElementTree} class can be used to wrap an element
				109	# structure, and convert it from and to XML.
				110	##
				111
				112	import string, sys, re
				113
				114	class _SimpleElementPath:
				115	# emulate pre-1.2 find/findtext/findall behaviour
				116	def find(self, element, tag):
				117	for elem in element:
				118	if elem.tag == tag:
				119	return elem
				120	return None
				121	def findtext(self, element, tag, default=None):
				122	for elem in element:
				123	if elem.tag == tag:
				124	return elem.text or ""
				125	return default
				126	def findall(self, element, tag):
				127	if tag[:3] == ".//":
				128	return element.getiterator(tag[3:])
				129	result = []
				130	for elem in element:
				131	if elem.tag == tag:
				132	result.append(elem)
				133	return result
				134
				135	try:
Alex Martelli	c5c45ba	2006-08-21 20:54:38 +0000	[diff] [blame]	136	from . import ElementPath
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	137	except ImportError:
				138	# FIXME: issue warning in this case?
Alex Martelli	c5c45ba	2006-08-21 20:54:38 +0000	[diff] [blame]	139	# TODO: DEFINITELY issue warning here!!!
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	140	ElementPath = _SimpleElementPath()
				141
				142	# TODO: add support for custom namespace resolvers/default namespaces
				143	# TODO: add improved support for incremental parsing
				144
				145	VERSION = "1.2.6"
				146
				147	##
				148	# Internal element class. This class defines the Element interface,
				149	# and provides a reference implementation of this interface.
				150	# <p>
				151	# You should not create instances of this class directly. Use the
				152	# appropriate factory functions instead, such as {@link #Element}
				153	# and {@link #SubElement}.
				154	#
				155	# @see Element
				156	# @see SubElement
				157	# @see Comment
				158	# @see ProcessingInstruction
				159
				160	class _ElementInterface:
				161	# <tag attrib>text<child/>...</tag>tail
				162
				163	##
				164	# (Attribute) Element tag.
				165
				166	tag = None
				167
				168	##
				169	# (Attribute) Element attribute dictionary. Where possible, use
				170	# {@link #_ElementInterface.get},
				171	# {@link #_ElementInterface.set},
				172	# {@link #_ElementInterface.keys}, and
				173	# {@link #_ElementInterface.items} to access
				174	# element attributes.
				175
				176	attrib = None
				177
				178	##
				179	# (Attribute) Text before first subelement. This is either a
				180	# string or the value None, if there was no text.
				181
				182	text = None
				183
				184	##
				185	# (Attribute) Text after this element's end tag, but before the
				186	# next sibling element's start tag. This is either a string or
				187	# the value None, if there was no text.
				188
				189	tail = None # text after end tag, if any
				190
				191	def __init__(self, tag, attrib):
				192	self.tag = tag
				193	self.attrib = attrib
				194	self._children = []
				195
				196	def __repr__(self):
				197	return "<Element %s at %x>" % (self.tag, id(self))
				198
				199	##
				200	# Creates a new element object of the same type as this element.
				201	#
				202	# @param tag Element tag.
				203	# @param attrib Element attributes, given as a dictionary.
				204	# @return A new element instance.
				205
				206	def makeelement(self, tag, attrib):
				207	return Element(tag, attrib)
				208
				209	##
				210	# Returns the number of subelements.
				211	#
				212	# @return The number of subelements.
				213
				214	def __len__(self):
				215	return len(self._children)
				216
				217	##
				218	# Returns the given subelement.
				219	#
				220	# @param index What subelement to return.
				221	# @return The given subelement.
				222	# @exception IndexError If the given element does not exist.
				223
				224	def __getitem__(self, index):
				225	return self._children[index]
				226
				227	##
				228	# Replaces the given subelement.
				229	#
				230	# @param index What subelement to replace.
				231	# @param element The new element value.
				232	# @exception IndexError If the given element does not exist.
				233	# @exception AssertionError If element is not a valid object.
				234
				235	def __setitem__(self, index, element):
				236	assert iselement(element)
				237	self._children[index] = element
				238
				239	##
				240	# Deletes the given subelement.
				241	#
				242	# @param index What subelement to delete.
				243	# @exception IndexError If the given element does not exist.
				244
				245	def __delitem__(self, index):
				246	del self._children[index]
				247
				248	##
				249	# Returns a list containing subelements in the given range.
				250	#
				251	# @param start The first subelement to return.
				252	# @param stop The first subelement that shouldn't be returned.
				253	# @return A sequence object containing subelements.
				254
				255	def __getslice__(self, start, stop):
				256	return self._children[start:stop]
				257
				258	##
				259	# Replaces a number of subelements with elements from a sequence.
				260	#
				261	# @param start The first subelement to replace.
				262	# @param stop The first subelement that shouldn't be replaced.
				263	# @param elements A sequence object with zero or more elements.
				264	# @exception AssertionError If a sequence member is not a valid object.
				265
				266	def __setslice__(self, start, stop, elements):
				267	for element in elements:
				268	assert iselement(element)
				269	self._children[start:stop] = list(elements)
				270
				271	##
				272	# Deletes a number of subelements.
				273	#
				274	# @param start The first subelement to delete.
				275	# @param stop The first subelement to leave in there.
				276
				277	def __delslice__(self, start, stop):
				278	del self._children[start:stop]
				279
				280	##
				281	# Adds a subelement to the end of this element.
				282	#
				283	# @param element The element to add.
				284	# @exception AssertionError If a sequence member is not a valid object.
				285
				286	def append(self, element):
				287	assert iselement(element)
				288	self._children.append(element)
				289
				290	##
				291	# Inserts a subelement at the given position in this element.
				292	#
				293	# @param index Where to insert the new subelement.
				294	# @exception AssertionError If the element is not a valid object.
				295
				296	def insert(self, index, element):
				297	assert iselement(element)
				298	self._children.insert(index, element)
				299
				300	##
				301	# Removes a matching subelement. Unlike the <b>find</b> methods,
				302	# this method compares elements based on identity, not on tag
				303	# value or contents.
				304	#
				305	# @param element What element to remove.
				306	# @exception ValueError If a matching element could not be found.
				307	# @exception AssertionError If the element is not a valid object.
				308
				309	def remove(self, element):
				310	assert iselement(element)
				311	self._children.remove(element)
				312
				313	##
				314	# Returns all subelements. The elements are returned in document
				315	# order.
				316	#
				317	# @return A list of subelements.
				318	# @defreturn list of Element instances
				319
				320	def getchildren(self):
				321	return self._children
				322
				323	##
				324	# Finds the first matching subelement, by tag name or path.
				325	#
				326	# @param path What element to look for.
				327	# @return The first matching element, or None if no element was found.
				328	# @defreturn Element or None
				329
				330	def find(self, path):
				331	return ElementPath.find(self, path)
				332
				333	##
				334	# Finds text for the first matching subelement, by tag name or path.
				335	#
				336	# @param path What element to look for.
				337	# @param default What to return if the element was not found.
				338	# @return The text content of the first matching element, or the
				339	# default value no element was found. Note that if the element
				340	# has is found, but has no text content, this method returns an
				341	# empty string.
				342	# @defreturn string
				343
				344	def findtext(self, path, default=None):
				345	return ElementPath.findtext(self, path, default)
				346
				347	##
				348	# Finds all matching subelements, by tag name or path.
				349	#
				350	# @param path What element to look for.
				351	# @return A list or iterator containing all matching elements,
				352	# in document order.
				353	# @defreturn list of Element instances
				354
				355	def findall(self, path):
				356	return ElementPath.findall(self, path)
				357
				358	##
				359	# Resets an element. This function removes all subelements, clears
				360	# all attributes, and sets the text and tail attributes to None.
				361
				362	def clear(self):
				363	self.attrib.clear()
				364	self._children = []
				365	self.text = self.tail = None
				366
				367	##
				368	# Gets an element attribute.
				369	#
				370	# @param key What attribute to look for.
				371	# @param default What to return if the attribute was not found.
				372	# @return The attribute value, or the default value, if the
				373	# attribute was not found.
				374	# @defreturn string or None
				375
				376	def get(self, key, default=None):
				377	return self.attrib.get(key, default)
				378
				379	##
				380	# Sets an element attribute.
				381	#
				382	# @param key What attribute to set.
				383	# @param value The attribute value.
				384
				385	def set(self, key, value):
				386	self.attrib[key] = value
				387
				388	##
				389	# Gets a list of attribute names. The names are returned in an
				390	# arbitrary order (just like for an ordinary Python dictionary).
				391	#
				392	# @return A list of element attribute names.
				393	# @defreturn list of strings
				394
				395	def keys(self):
				396	return self.attrib.keys()
				397
				398	##
				399	# Gets element attributes, as a sequence. The attributes are
				400	# returned in an arbitrary order.
				401	#
				402	# @return A list of (name, value) tuples for all attributes.
				403	# @defreturn list of (string, string) tuples
				404
				405	def items(self):
				406	return self.attrib.items()
				407
				408	##
				409	# Creates a tree iterator. The iterator loops over this element
				410	# and all subelements, in document order, and returns all elements
				411	# with a matching tag.
				412	# <p>
				413	# If the tree structure is modified during iteration, the result
				414	# is undefined.
				415	#
				416	# @param tag What tags to look for (default is to return all elements).
				417	# @return A list or iterator containing all the matching elements.
				418	# @defreturn list or iterator
				419
				420	def getiterator(self, tag=None):
				421	nodes = []
				422	if tag == "*":
				423	tag = None
				424	if tag is None or self.tag == tag:
				425	nodes.append(self)
				426	for node in self._children:
				427	nodes.extend(node.getiterator(tag))
				428	return nodes
				429
				430	# compatibility
				431	_Element = _ElementInterface
				432
				433	##
				434	# Element factory. This function returns an object implementing the
				435	# standard Element interface. The exact class or type of that object
				436	# is implementation dependent, but it will always be compatible with
				437	# the {@link #_ElementInterface} class in this module.
				438	# <p>
				439	# The element name, attribute names, and attribute values can be
				440	# either 8-bit ASCII strings or Unicode strings.
				441	#
				442	# @param tag The element name.
				443	# @param attrib An optional dictionary, containing element attributes.
				444	# @param **extra Additional attributes, given as keyword arguments.
				445	# @return An element instance.
				446	# @defreturn Element
				447
				448	def Element(tag, attrib={}, **extra):
				449	attrib = attrib.copy()
				450	attrib.update(extra)
				451	return _ElementInterface(tag, attrib)
				452
				453	##
				454	# Subelement factory. This function creates an element instance, and
				455	# appends it to an existing element.
				456	# <p>
				457	# The element name, attribute names, and attribute values can be
				458	# either 8-bit ASCII strings or Unicode strings.
				459	#
				460	# @param parent The parent element.
				461	# @param tag The subelement name.
				462	# @param attrib An optional dictionary, containing element attributes.
				463	# @param **extra Additional attributes, given as keyword arguments.
				464	# @return An element instance.
				465	# @defreturn Element
				466
				467	def SubElement(parent, tag, attrib={}, **extra):
				468	attrib = attrib.copy()
				469	attrib.update(extra)
				470	element = parent.makeelement(tag, attrib)
				471	parent.append(element)
				472	return element
				473
				474	##
				475	# Comment element factory. This factory function creates a special
				476	# element that will be serialized as an XML comment.
				477	# <p>
				478	# The comment string can be either an 8-bit ASCII string or a Unicode
				479	# string.
				480	#
				481	# @param text A string containing the comment string.
				482	# @return An element instance, representing a comment.
				483	# @defreturn Element
				484
				485	def Comment(text=None):
				486	element = Element(Comment)
				487	element.text = text
				488	return element
				489
				490	##
				491	# PI element factory. This factory function creates a special element
				492	# that will be serialized as an XML processing instruction.
				493	#
				494	# @param target A string containing the PI target.
				495	# @param text A string containing the PI contents, if any.
				496	# @return An element instance, representing a PI.
				497	# @defreturn Element
				498
				499	def ProcessingInstruction(target, text=None):
				500	element = Element(ProcessingInstruction)
				501	element.text = target
				502	if text:
				503	element.text = element.text + " " + text
				504	return element
				505
				506	PI = ProcessingInstruction
				507
				508	##
				509	# QName wrapper. This can be used to wrap a QName attribute value, in
				510	# order to get proper namespace handling on output.
				511	#
				512	# @param text A string containing the QName value, in the form {uri}local,
				513	# or, if the tag argument is given, the URI part of a QName.
				514	# @param tag Optional tag. If given, the first argument is interpreted as
				515	# an URI, and this argument is interpreted as a local name.
				516	# @return An opaque object, representing the QName.
				517
				518	class QName:
				519	def __init__(self, text_or_uri, tag=None):
				520	if tag:
				521	text_or_uri = "{%s}%s" % (text_or_uri, tag)
				522	self.text = text_or_uri
				523	def __str__(self):
				524	return self.text
				525	def __hash__(self):
				526	return hash(self.text)
				527	def __cmp__(self, other):
				528	if isinstance(other, QName):
				529	return cmp(self.text, other.text)
				530	return cmp(self.text, other)
				531
				532	##
				533	# ElementTree wrapper class. This class represents an entire element
				534	# hierarchy, and adds some extra support for serialization to and from
				535	# standard XML.
				536	#
				537	# @param element Optional root element.
				538	# @keyparam file Optional file handle or name. If given, the
				539	# tree is initialized with the contents of this XML file.
				540
				541	class ElementTree:
				542
				543	def __init__(self, element=None, file=None):
				544	assert element is None or iselement(element)
				545	self._root = element # first node
				546	if file:
				547	self.parse(file)
				548
				549	##
				550	# Gets the root element for this tree.
				551	#
				552	# @return An element instance.
				553	# @defreturn Element
				554
				555	def getroot(self):
				556	return self._root
				557
				558	##
				559	# Replaces the root element for this tree. This discards the
				560	# current contents of the tree, and replaces it with the given
				561	# element. Use with care.
				562	#
				563	# @param element An element instance.
				564
				565	def _setroot(self, element):
				566	assert iselement(element)
				567	self._root = element
				568
				569	##
				570	# Loads an external XML document into this element tree.
				571	#
				572	# @param source A file name or file object.
				573	# @param parser An optional parser instance. If not given, the
				574	# standard {@link XMLTreeBuilder} parser is used.
				575	# @return The document root element.
				576	# @defreturn Element
				577
				578	def parse(self, source, parser=None):
				579	if not hasattr(source, "read"):
				580	source = open(source, "rb")
				581	if not parser:
				582	parser = XMLTreeBuilder()
				583	while 1:
				584	data = source.read(32768)
				585	if not data:
				586	break
				587	parser.feed(data)
				588	self._root = parser.close()
				589	return self._root
				590
				591	##
				592	# Creates a tree iterator for the root element. The iterator loops
				593	# over all elements in this tree, in document order.
				594	#
				595	# @param tag What tags to look for (default is to return all elements)
				596	# @return An iterator.
				597	# @defreturn iterator
				598
				599	def getiterator(self, tag=None):
				600	assert self._root is not None
				601	return self._root.getiterator(tag)
				602
				603	##
				604	# Finds the first toplevel element with given tag.
				605	# Same as getroot().find(path).
				606	#
				607	# @param path What element to look for.
				608	# @return The first matching element, or None if no element was found.
				609	# @defreturn Element or None
				610
				611	def find(self, path):
				612	assert self._root is not None
				613	if path[:1] == "/":
				614	path = "." + path
				615	return self._root.find(path)
				616
				617	##
				618	# Finds the element text for the first toplevel element with given
				619	# tag. Same as getroot().findtext(path).
				620	#
				621	# @param path What toplevel element to look for.
				622	# @param default What to return if the element was not found.
				623	# @return The text content of the first matching element, or the
				624	# default value no element was found. Note that if the element
				625	# has is found, but has no text content, this method returns an
				626	# empty string.
				627	# @defreturn string
				628
				629	def findtext(self, path, default=None):
				630	assert self._root is not None
				631	if path[:1] == "/":
				632	path = "." + path
				633	return self._root.findtext(path, default)
				634
				635	##
				636	# Finds all toplevel elements with the given tag.
				637	# Same as getroot().findall(path).
				638	#
				639	# @param path What element to look for.
				640	# @return A list or iterator containing all matching elements,
				641	# in document order.
				642	# @defreturn list of Element instances
				643
				644	def findall(self, path):
				645	assert self._root is not None
				646	if path[:1] == "/":
				647	path = "." + path
				648	return self._root.findall(path)
				649
				650	##
				651	# Writes the element tree to a file, as XML.
				652	#
				653	# @param file A file name, or a file object opened for writing.
				654	# @param encoding Optional output encoding (default is US-ASCII).
				655
				656	def write(self, file, encoding="us-ascii"):
				657	assert self._root is not None
				658	if not hasattr(file, "write"):
				659	file = open(file, "wb")
				660	if not encoding:
				661	encoding = "us-ascii"
				662	elif encoding != "utf-8" and encoding != "us-ascii":
				663	file.write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
				664	self._write(file, self._root, encoding, {})
				665
				666	def _write(self, file, node, encoding, namespaces):
				667	# write XML to file
				668	tag = node.tag
				669	if tag is Comment:
				670	file.write("<!-- %s -->" % _escape_cdata(node.text, encoding))
				671	elif tag is ProcessingInstruction:
				672	file.write("<?%s?>" % _escape_cdata(node.text, encoding))
				673	else:
				674	items = node.items()
				675	xmlns_items = [] # new namespaces in this scope
				676	try:
				677	if isinstance(tag, QName) or tag[:1] == "{":
				678	tag, xmlns = fixtag(tag, namespaces)
				679	if xmlns: xmlns_items.append(xmlns)
				680	except TypeError:
				681	_raise_serialization_error(tag)
				682	file.write("<" + _encode(tag, encoding))
				683	if items or xmlns_items:
				684	items.sort() # lexical order
				685	for k, v in items:
				686	try:
				687	if isinstance(k, QName) or k[:1] == "{":
				688	k, xmlns = fixtag(k, namespaces)
				689	if xmlns: xmlns_items.append(xmlns)
				690	except TypeError:
				691	_raise_serialization_error(k)
				692	try:
				693	if isinstance(v, QName):
				694	v, xmlns = fixtag(v, namespaces)
				695	if xmlns: xmlns_items.append(xmlns)
				696	except TypeError:
				697	_raise_serialization_error(v)
				698	file.write(" %s=\"%s\"" % (_encode(k, encoding),
				699	_escape_attrib(v, encoding)))
				700	for k, v in xmlns_items:
				701	file.write(" %s=\"%s\"" % (_encode(k, encoding),
				702	_escape_attrib(v, encoding)))
				703	if node.text or len(node):
				704	file.write(">")
				705	if node.text:
				706	file.write(_escape_cdata(node.text, encoding))
				707	for n in node:
				708	self._write(file, n, encoding, namespaces)
				709	file.write("</" + _encode(tag, encoding) + ">")
				710	else:
				711	file.write(" />")
				712	for k, v in xmlns_items:
				713	del namespaces[v]
				714	if node.tail:
				715	file.write(_escape_cdata(node.tail, encoding))
				716
				717	# --------------------------------------------------------------------
				718	# helpers
				719
				720	##
				721	# Checks if an object appears to be a valid element object.
				722	#
				723	# @param element An element instance.
				724	# @return A true value if this is an element object.
				725	# @defreturn flag
				726
				727	def iselement(element):
				728	# FIXME: not sure about this; might be a better idea to look
				729	# for tag/attrib/text attributes
				730	return isinstance(element, _ElementInterface) or hasattr(element, "tag")
				731
				732	##
				733	# Writes an element tree or element structure to sys.stdout. This
				734	# function should be used for debugging only.
				735	# <p>
				736	# The exact output format is implementation dependent. In this
				737	# version, it's written as an ordinary XML file.
				738	#
				739	# @param elem An element tree or an individual element.
				740
				741	def dump(elem):
				742	# debugging
				743	if not isinstance(elem, ElementTree):
				744	elem = ElementTree(elem)
				745	elem.write(sys.stdout)
				746	tail = elem.getroot().tail
				747	if not tail or tail[-1] != "\n":
				748	sys.stdout.write("\n")
				749
				750	def _encode(s, encoding):
				751	try:
				752	return s.encode(encoding)
				753	except AttributeError:
				754	return s # 1.5.2: assume the string uses the right encoding
				755
# Pattern matching runs of characters that have to be rewritten as
# entities: the reserved characters & < > " plus everything from code
# point 0x80 upwards.  It is the default `pattern` of _encode_entity.
if sys.version[:3] == "1.5":
    # byte-oriented character class for interpreters reporting "1.5"
    _escape = re.compile(r"[&<>\"\x80-\xff]+") # 1.5.2
else:
    # The unicode literal is built with eval() from a constant string --
    # presumably so that this line still compiles on interpreters
    # without u"" literals (TODO confirm).  No external input is
    # evaluated.
    _escape = re.compile(eval(r'u"[&<>\"\u0080-\uffff]+"'))
				760
				761	_escape_map = {
				762	"&": "&",
				763	"<": "<",
				764	">": ">",
				765	'"': """,
				766	}
				767
# Maps namespace URI -> preferred prefix.  fixtag() consults this table
# when a URI has no prefix in the current scope, before falling back to
# a generated "ns%d" prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
}
				775
				776	def _raise_serialization_error(text):
				777	raise TypeError(
				778	"cannot serialize %r (type %s)" % (text, type(text).__name__)
				779	)
				780
				781	def _encode_entity(text, pattern=_escape):
				782	# map reserved and non-ascii characters to numerical entities
				783	def escape_entities(m, map=_escape_map):
				784	out = []
				785	append = out.append
				786	for char in m.group():
				787	text = map.get(char)
				788	if text is None:
				789	text = "&#%d;" % ord(char)
				790	append(text)
				791	return string.join(out, "")
				792	try:
				793	return _encode(pattern.sub(escape_entities, text), "ascii")
				794	except TypeError:
				795	_raise_serialization_error(text)
				796
				797	#
				798	# the following functions assume an ascii-compatible encoding
				799	# (or "utf-16")
				800
				801	def _escape_cdata(text, encoding=None, replace=string.replace):
				802	# escape character data
				803	try:
				804	if encoding:
				805	try:
				806	text = _encode(text, encoding)
				807	except UnicodeError:
				808	return _encode_entity(text)
				809	text = replace(text, "&", "&")
				810	text = replace(text, "<", "<")
				811	text = replace(text, ">", ">")
				812	return text
				813	except (TypeError, AttributeError):
				814	_raise_serialization_error(text)
				815
				816	def _escape_attrib(text, encoding=None, replace=string.replace):
				817	# escape attribute value
				818	try:
				819	if encoding:
				820	try:
				821	text = _encode(text, encoding)
				822	except UnicodeError:
				823	return _encode_entity(text)
				824	text = replace(text, "&", "&")
				825	text = replace(text, "'", "'") # FIXME: overkill
				826	text = replace(text, "\"", """)
				827	text = replace(text, "<", "<")
				828	text = replace(text, ">", ">")
				829	return text
				830	except (TypeError, AttributeError):
				831	_raise_serialization_error(text)
				832
				833	def fixtag(tag, namespaces):
				834	# given a decorated tag (of the form {uri}tag), return prefixed
				835	# tag and namespace declaration, if any
				836	if isinstance(tag, QName):
				837	tag = tag.text
				838	namespace_uri, tag = string.split(tag[1:], "}", 1)
				839	prefix = namespaces.get(namespace_uri)
				840	if prefix is None:
				841	prefix = _namespace_map.get(namespace_uri)
				842	if prefix is None:
				843	prefix = "ns%d" % len(namespaces)
				844	namespaces[namespace_uri] = prefix
				845	if prefix == "xml":
				846	xmlns = None
				847	else:
				848	xmlns = ("xmlns:%s" % prefix, namespace_uri)
				849	else:
				850	xmlns = None
				851	return "%s:%s" % (prefix, tag), xmlns
				852
				853	##
				854	# Parses an XML document into an element tree.
				855	#
				856	# @param source A filename or file object containing XML data.
				857	# @param parser An optional parser instance. If not given, the
				858	# standard {@link XMLTreeBuilder} parser is used.
				859	# @return An ElementTree instance
				860
				861	def parse(source, parser=None):
				862	tree = ElementTree()
				863	tree.parse(source, parser)
				864	return tree
				865
				866	##
				867	# Parses an XML document into an element tree incrementally, and reports
				868	# what's going on to the user.
				869	#
				870	# @param source A filename or file object containing XML data.
				871	# @param events A list of events to report back. If omitted, only "end"
				872	# events are reported.
				873	# @return A (event, elem) iterator.
				874
				875	class iterparse:
				876
				877	def __init__(self, source, events=None):
				878	if not hasattr(source, "read"):
				879	source = open(source, "rb")
				880	self._file = source
				881	self._events = []
				882	self._index = 0
				883	self.root = self._root = None
				884	self._parser = XMLTreeBuilder()
				885	# wire up the parser for event reporting
				886	parser = self._parser._parser
				887	append = self._events.append
				888	if events is None:
				889	events = ["end"]
				890	for event in events:
				891	if event == "start":
				892	try:
				893	parser.ordered_attributes = 1
				894	parser.specified_attributes = 1
				895	def handler(tag, attrib_in, event=event, append=append,
				896	start=self._parser._start_list):
				897	append((event, start(tag, attrib_in)))
				898	parser.StartElementHandler = handler
				899	except AttributeError:
				900	def handler(tag, attrib_in, event=event, append=append,
				901	start=self._parser._start):
				902	append((event, start(tag, attrib_in)))
				903	parser.StartElementHandler = handler
				904	elif event == "end":
				905	def handler(tag, event=event, append=append,
				906	end=self._parser._end):
				907	append((event, end(tag)))
				908	parser.EndElementHandler = handler
				909	elif event == "start-ns":
				910	def handler(prefix, uri, event=event, append=append):
				911	try:
				912	uri = _encode(uri, "ascii")
				913	except UnicodeError:
				914	pass
				915	append((event, (prefix or "", uri)))
				916	parser.StartNamespaceDeclHandler = handler
				917	elif event == "end-ns":
				918	def handler(prefix, event=event, append=append):
				919	append((event, None))
				920	parser.EndNamespaceDeclHandler = handler
				921
				922	def next(self):
				923	while 1:
				924	try:
				925	item = self._events[self._index]
				926	except IndexError:
				927	if self._parser is None:
				928	self.root = self._root
				929	try:
				930	raise StopIteration
				931	except NameError:
				932	raise IndexError
				933	# load event buffer
				934	del self._events[:]
				935	self._index = 0
				936	data = self._file.read(16384)
				937	if data:
				938	self._parser.feed(data)
				939	else:
				940	self._root = self._parser.close()
				941	self._parser = None
				942	else:
				943	self._index = self._index + 1
				944	return item
				945
				946	try:
				947	iter
				948	def __iter__(self):
				949	return self
				950	except NameError:
				951	def __getitem__(self, index):
				952	return self.next()
				953
				954	##
				955	# Parses an XML document from a string constant. This function can
				956	# be used to embed "XML literals" in Python code.
				957	#
				958	# @param source A string containing XML data.
				959	# @return An Element instance.
				960	# @defreturn Element
				961
				962	def XML(text):
				963	parser = XMLTreeBuilder()
				964	parser.feed(text)
				965	return parser.close()
				966
				967	##
				968	# Parses an XML document from a string constant, and also returns
				969	# a dictionary which maps from element id:s to elements.
				970	#
				971	# @param source A string containing XML data.
				972	# @return A tuple containing an Element instance and a dictionary.
				973	# @defreturn (Element, dictionary)
				974
				975	def XMLID(text):
				976	parser = XMLTreeBuilder()
				977	parser.feed(text)
				978	tree = parser.close()
				979	ids = {}
				980	for elem in tree.getiterator():
				981	id = elem.get("id")
				982	if id:
				983	ids[id] = elem
				984	return tree, ids
				985
				986	##
				987	# Parses an XML document from a string constant. Same as {@link #XML}.
				988	#
				989	# @def fromstring(text)
				990	# @param source A string containing XML data.
				991	# @return An Element instance.
				992	# @defreturn Element
				993
# fromstring is a second name bound to the same function object as XML
fromstring = XML
				995
				996	##
				997	# Generates a string representation of an XML element, including all
				998	# subelements.
				999	#
				1000	# @param element An Element instance.
				1001	# @return An encoded string containing the XML data.
				1002	# @defreturn string
				1003
				1004	def tostring(element, encoding=None):
				1005	class dummy:
				1006	pass
				1007	data = []
				1008	file = dummy()
				1009	file.write = data.append
				1010	ElementTree(element).write(file, encoding)
				1011	return string.join(data, "")
				1012
				1013	##
				1014	# Generic element structure builder. This builder converts a sequence
				1015	# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
				1016	# #TreeBuilder.end} method calls to a well-formed element structure.
				1017	# <p>
				1018	# You can use this class to build an element structure using a custom XML
				1019	# parser, or a parser for some other XML-like format.
				1020	#
				1021	# @param element_factory Optional element factory. This factory
				1022	# is called to create new Element instances, as necessary.
				1023
				1024	class TreeBuilder:
				1025
				1026	def __init__(self, element_factory=None):
				1027	self._data = [] # data collector
				1028	self._elem = [] # element stack
				1029	self._last = None # last element
				1030	self._tail = None # true if we're after an end tag
				1031	if element_factory is None:
				1032	element_factory = _ElementInterface
				1033	self._factory = element_factory
				1034
				1035	##
				1036	# Flushes the parser buffers, and returns the toplevel documen
				1037	# element.
				1038	#
				1039	# @return An Element instance.
				1040	# @defreturn Element
				1041
				1042	def close(self):
				1043	assert len(self._elem) == 0, "missing end tags"
				1044	assert self._last != None, "missing toplevel element"
				1045	return self._last
				1046
				1047	def _flush(self):
				1048	if self._data:
				1049	if self._last is not None:
				1050	text = string.join(self._data, "")
				1051	if self._tail:
				1052	assert self._last.tail is None, "internal error (tail)"
				1053	self._last.tail = text
				1054	else:
				1055	assert self._last.text is None, "internal error (text)"
				1056	self._last.text = text
				1057	self._data = []
				1058
				1059	##
				1060	# Adds text to the current element.
				1061	#
				1062	# @param data A string. This should be either an 8-bit string
				1063	# containing ASCII text, or a Unicode string.
				1064
				1065	def data(self, data):
				1066	self._data.append(data)
				1067
				1068	##
				1069	# Opens a new element.
				1070	#
				1071	# @param tag The element name.
				1072	# @param attrib A dictionary containing element attributes.
				1073	# @return The opened element.
				1074	# @defreturn Element
				1075
				1076	def start(self, tag, attrs):
				1077	self._flush()
				1078	self._last = elem = self._factory(tag, attrs)
				1079	if self._elem:
				1080	self._elem[-1].append(elem)
				1081	self._elem.append(elem)
				1082	self._tail = 0
				1083	return elem
				1084
				1085	##
				1086	# Closes the current element.
				1087	#
				1088	# @param tag The element name.
				1089	# @return The closed element.
				1090	# @defreturn Element
				1091
				1092	def end(self, tag):
				1093	self._flush()
				1094	self._last = self._elem.pop()
				1095	assert self._last.tag == tag,\
				1096	"end tag mismatch (expected %s, got %s)" % (
				1097	self._last.tag, tag)
				1098	self._tail = 1
				1099	return self._last
				1100
				1101	##
				1102	# Element structure builder for XML source data, based on the
				1103	# <b>expat</b> parser.
				1104	#
				1105	# @keyparam target Target object. If omitted, the builder uses an
				1106	# instance of the standard {@link #TreeBuilder} class.
				1107	# @keyparam html Predefine HTML entities. This flag is not supported
				1108	# by the current implementation.
				1109	# @see #ElementTree
				1110	# @see #TreeBuilder
				1111
				1112	class XMLTreeBuilder:
				1113
				1114	def __init__(self, html=0, target=None):
				1115	try:
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	1116	from xml.parsers import expat
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1117	except ImportError:
				1118	raise ImportError(
				1119	"No module named expat; use SimpleXMLTreeBuilder instead"
				1120	)
				1121	self._parser = parser = expat.ParserCreate(None, "}")
				1122	if target is None:
				1123	target = TreeBuilder()
				1124	self._target = target
				1125	self._names = {} # name memo cache
				1126	# callbacks
				1127	parser.DefaultHandlerExpand = self._default
				1128	parser.StartElementHandler = self._start
				1129	parser.EndElementHandler = self._end
				1130	parser.CharacterDataHandler = self._data
				1131	# let expat do the buffering, if supported
				1132	try:
				1133	self._parser.buffer_text = 1
				1134	except AttributeError:
				1135	pass
				1136	# use new-style attribute handling, if supported
				1137	try:
				1138	self._parser.ordered_attributes = 1
				1139	self._parser.specified_attributes = 1
				1140	parser.StartElementHandler = self._start_list
				1141	except AttributeError:
				1142	pass
				1143	encoding = None
				1144	if not parser.returns_unicode:
				1145	encoding = "utf-8"
				1146	# target.xml(encoding, None)
				1147	self._doctype = None
				1148	self.entity = {}
				1149
				1150	def _fixtext(self, text):
				1151	# convert text string to ascii, if possible
				1152	try:
				1153	return _encode(text, "ascii")
				1154	except UnicodeError:
				1155	return text
				1156
				1157	def _fixname(self, key):
				1158	# expand qname, and convert name string to ascii, if possible
				1159	try:
				1160	name = self._names[key]
				1161	except KeyError:
				1162	name = key
				1163	if "}" in name:
				1164	name = "{" + name
				1165	self._names[key] = name = self._fixtext(name)
				1166	return name
				1167
				1168	def _start(self, tag, attrib_in):
				1169	fixname = self._fixname
				1170	tag = fixname(tag)
				1171	attrib = {}
				1172	for key, value in attrib_in.items():
				1173	attrib[fixname(key)] = self._fixtext(value)
				1174	return self._target.start(tag, attrib)
				1175
				1176	def _start_list(self, tag, attrib_in):
				1177	fixname = self._fixname
				1178	tag = fixname(tag)
				1179	attrib = {}
				1180	if attrib_in:
				1181	for i in range(0, len(attrib_in), 2):
				1182	attrib[fixname(attrib_in[i])] = self._fixtext(attrib_in[i+1])
				1183	return self._target.start(tag, attrib)
				1184
				1185	def _data(self, text):
				1186	return self._target.data(self._fixtext(text))
				1187
				1188	def _end(self, tag):
				1189	return self._target.end(self._fixname(tag))
				1190
				1191	def _default(self, text):
				1192	prefix = text[:1]
				1193	if prefix == "&":
				1194	# deal with undefined entities
				1195	try:
				1196	self._target.data(self.entity[text[1:-1]])
				1197	except KeyError:
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	1198	from xml.parsers import expat
Armin Rigo	9ed7306	2005-12-14 18:10:45 +0000	[diff] [blame]	1199	raise expat.error(
				1200	"undefined entity %s: line %d, column %d" %
				1201	(text, self._parser.ErrorLineNumber,
				1202	self._parser.ErrorColumnNumber)
				1203	)
				1204	elif prefix == "<" and text[:9] == "<!DOCTYPE":
				1205	self._doctype = [] # inside a doctype declaration
				1206	elif self._doctype is not None:
				1207	# parse doctype contents
				1208	if prefix == ">":
				1209	self._doctype = None
				1210	return
				1211	text = string.strip(text)
				1212	if not text:
				1213	return
				1214	self._doctype.append(text)
				1215	n = len(self._doctype)
				1216	if n > 2:
				1217	type = self._doctype[1]
				1218	if type == "PUBLIC" and n == 4:
				1219	name, type, pubid, system = self._doctype
				1220	elif type == "SYSTEM" and n == 3:
				1221	name, type, system = self._doctype
				1222	pubid = None
				1223	else:
				1224	return
				1225	if pubid:
				1226	pubid = pubid[1:-1]
				1227	self.doctype(name, pubid, system[1:-1])
				1228	self._doctype = None
				1229
				1230	##
				1231	# Handles a doctype declaration.
				1232	#
				1233	# @param name Doctype name.
				1234	# @param pubid Public identifier.
				1235	# @param system System identifier.
				1236
				1237	def doctype(self, name, pubid, system):
				1238	pass
				1239
				1240	##
				1241	# Feeds data to the parser.
				1242	#
				1243	# @param data Encoded data.
				1244
				1245	def feed(self, data):
				1246	self._parser.Parse(data, 0)
				1247
				1248	##
				1249	# Finishes feeding data to the parser.
				1250	#
				1251	# @return An element structure.
				1252	# @defreturn Element
				1253
				1254	def close(self):
				1255	self._parser.Parse("", 1) # end of data
				1256	tree = self._target.close()
				1257	del self._target, self._parser # get rid of circular references
				1258	return tree
Thomas Wouters	0e3f591	2006-08-11 14:57:12 +0000	[diff] [blame]	1259
# compatibility: XMLParser is a second name bound to the XMLTreeBuilder class
XMLParser = XMLTreeBuilder