"""Beautiful Soup
| 2 | Elixir and Tonic |
| 3 | "The Screen-Scraper's Friend" |
| 4 | http://www.crummy.com/software/BeautifulSoup/ |
| 5 | |
| 6 | Beautiful Soup parses a (possibly invalid) XML or HTML document into a |
| 7 | tree representation. It provides methods and Pythonic idioms that make |
| 8 | it easy to navigate, search, and modify the tree. |
| 9 | |
| 10 | A well-formed XML/HTML document yields a well-formed data |
| 11 | structure. An ill-formed XML/HTML document yields a correspondingly |
| 12 | ill-formed data structure. If your document is only locally |
| 13 | well-formed, you can use this library to find and process the |
| 14 | well-formed part of it. |
| 15 | |
| 16 | Beautiful Soup works with Python 2.2 and up. It has no external |
| 17 | dependencies, but you'll have more success at converting data to UTF-8 |
| 18 | if you also install these three packages: |
| 19 | |
| 20 | * chardet, for auto-detecting character encodings |
| 21 | http://chardet.feedparser.org/ |
| 22 | * cjkcodecs and iconv_codec, which add more encodings to the ones supported |
| 23 | by stock Python. |
| 24 | http://cjkpython.i18n.org/ |
| 25 | |
| 26 | Beautiful Soup defines classes for two main parsing strategies: |
| 27 | |
| 28 | * BeautifulStoneSoup, for parsing XML, SGML, or your domain-specific |
| 29 | language that kind of looks like XML. |
| 30 | |
| 31 | * BeautifulSoup, for parsing run-of-the-mill HTML code, be it valid |
| 32 | or invalid. This class has web browser-like heuristics for |
| 33 | obtaining a sensible parse tree in the face of common HTML errors. |
| 34 | |
| 35 | Beautiful Soup also defines a class (UnicodeDammit) for autodetecting |
| 36 | the encoding of an HTML or XML document, and converting it to |
| 37 | Unicode. Much of this code is taken from Mark Pilgrim's Universal Feed Parser. |
| 38 | |
| 39 | For more than you ever wanted to know about Beautiful Soup, see the |
| 40 | documentation: |
| 41 | http://www.crummy.com/software/BeautifulSoup/documentation.html |
| 42 | |
| 43 | Here, have some legalese: |
| 44 | |
| 45 | Copyright (c) 2004-2010, Leonard Richardson |
| 46 | |
| 47 | All rights reserved. |
| 48 | |
| 49 | Redistribution and use in source and binary forms, with or without |
| 50 | modification, are permitted provided that the following conditions are |
| 51 | met: |
| 52 | |
| 53 | * Redistributions of source code must retain the above copyright |
| 54 | notice, this list of conditions and the following disclaimer. |
| 55 | |
| 56 | * Redistributions in binary form must reproduce the above |
| 57 | copyright notice, this list of conditions and the following |
| 58 | disclaimer in the documentation and/or other materials provided |
| 59 | with the distribution. |
| 60 | |
| 61 | * Neither the name of the the Beautiful Soup Consortium and All |
| 62 | Night Kosher Bakery nor the names of its contributors may be |
| 63 | used to endorse or promote products derived from this software |
| 64 | without specific prior written permission. |
| 65 | |
| 66 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 67 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 68 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 69 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR |
| 70 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 71 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 72 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 73 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 74 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 75 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 76 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE, DAMMIT. |
| 77 | |
| 78 | """ |
| 79 | from __future__ import generators |
| 80 | |
| 81 | __author__ = "Leonard Richardson (leonardr@segfault.org)" |
| 82 | __version__ = "3.2.1" |
| 83 | __copyright__ = "Copyright (c) 2004-2012 Leonard Richardson" |
| 84 | __license__ = "New-style BSD" |
| 85 | |
| 86 | from sgmllib import SGMLParser, SGMLParseError |
| 87 | import codecs |
| 88 | import markupbase |
| 89 | import types |
| 90 | import re |
| 91 | import sgmllib |
| 92 | try: |
| 93 | from htmlentitydefs import name2codepoint |
| 94 | except ImportError: |
| 95 | name2codepoint = {} |
| 96 | try: |
| 97 | set |
| 98 | except NameError: |
| 99 | from sets import Set as set |
| 100 | |
#These hacks make Beautiful Soup able to parse XML with namespaces.
#sgmllib's stock tag-name and declaration-name patterns reject ':',
#so namespaced names like <dc:title> would not parse without them.
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
markupbase._declname_match = re.compile(r'[a-zA-Z][-_.:a-zA-Z0-9]*\s*').match

# Encoding used whenever a document is rendered to a byte string and no
# explicit encoding is given.
DEFAULT_OUTPUT_ENCODING = "utf-8"
| 106 | |
| 107 | def _match_css_class(str): |
| 108 | """Build a RE to match the given CSS class.""" |
| 109 | return re.compile(r"(^|.*\s)%s($|\s)" % str) |
| 110 | |
| 111 | # First, the classes that represent markup elements. |
| 112 | |
class PageElement(object):
    """Contains the navigational information for some part of the page
    (either a tag or a piece of text)"""

    def _invert(h):
        "Cheap function to invert a hash."
        i = {}
        for k,v in h.items():
            i[v] = k
        return i

    XML_ENTITIES_TO_SPECIAL_CHARS = { "apos" : "'",
                                      "quot" : '"',
                                      "amp" : "&",
                                      "lt" : "<",
                                      "gt" : ">" }

    XML_SPECIAL_CHARS_TO_ENTITIES = _invert(XML_ENTITIES_TO_SPECIAL_CHARS)

    def setup(self, parent=None, previous=None):
        """Sets up the initial relations between this element and
        other elements."""
        self.parent = parent
        self.previous = previous
        self.next = None
        self.previousSibling = None
        self.nextSibling = None
        if self.parent and self.parent.contents:
            self.previousSibling = self.parent.contents[-1]
            self.previousSibling.nextSibling = self

    def replaceWith(self, replaceWith):
        """Replaces this element in the tree with the given element."""
        oldParent = self.parent
        myIndex = self.parent.index(self)
        if hasattr(replaceWith, "parent")\
           and replaceWith.parent is self.parent:
            # We're replacing this element with one of its siblings.
            index = replaceWith.parent.index(replaceWith)
            # BUGFIX: this test used to be "if index and index < myIndex",
            # which silently skipped the adjustment whenever the
            # replacement was the parent's *first* child (index 0 is
            # falsy), inserting the replacement one position too late.
            if index < myIndex:
                # Furthermore, it comes before this element. That
                # means that when we extract it, the index of this
                # element will change.
                myIndex = myIndex - 1
        self.extract()
        oldParent.insert(myIndex, replaceWith)

    def replaceWithChildren(self):
        """Replaces this element with its own children, splicing them
        into this element's old position in the parent."""
        myParent = self.parent
        myIndex = self.parent.index(self)
        self.extract()
        # Insert in reverse so repeated inserts at the same index
        # preserve the children's original order.
        reversedChildren = list(self.contents)
        reversedChildren.reverse()
        for child in reversedChildren:
            myParent.insert(myIndex, child)

    def extract(self):
        """Destructively rips this element out of the tree."""
        if self.parent:
            try:
                del self.parent.contents[self.parent.index(self)]
            except ValueError:
                # Best effort: the element may already be detached from
                # its parent's contents list.
                pass

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        lastChild = self._lastRecursiveChild()
        nextElement = lastChild.next

        if self.previous:
            self.previous.next = nextElement
        if nextElement:
            nextElement.previous = self.previous
        self.previous = None
        lastChild.next = None

        self.parent = None
        if self.previousSibling:
            self.previousSibling.nextSibling = self.nextSibling
        if self.nextSibling:
            self.nextSibling.previousSibling = self.previousSibling
        self.previousSibling = self.nextSibling = None
        return self

    def _lastRecursiveChild(self):
        "Finds the last element beneath this object to be parsed."
        lastChild = self
        while hasattr(lastChild, 'contents') and lastChild.contents:
            lastChild = lastChild.contents[-1]
        return lastChild

    def insert(self, position, newChild):
        """Inserts newChild into this element's contents at the given
        position, rewiring all next/previous/sibling pointers."""
        if isinstance(newChild, basestring) \
            and not isinstance(newChild, NavigableString):
            newChild = NavigableString(newChild)

        position = min(position, len(self.contents))
        if hasattr(newChild, 'parent') and newChild.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if newChild.parent is self:
                index = self.index(newChild)
                if index > position:
                    # Furthermore we're moving it further down the
                    # list of this object's children. That means that
                    # when we extract this element, our target index
                    # will jump down one.
                    position = position - 1
            newChild.extract()

        newChild.parent = self
        previousChild = None
        if position == 0:
            newChild.previousSibling = None
            newChild.previous = self
        else:
            previousChild = self.contents[position-1]
            newChild.previousSibling = previousChild
            newChild.previousSibling.nextSibling = newChild
            newChild.previous = previousChild._lastRecursiveChild()
        if newChild.previous:
            newChild.previous.next = newChild

        newChildsLastElement = newChild._lastRecursiveChild()

        if position >= len(self.contents):
            newChild.nextSibling = None

            # Walk up until we find an ancestor with a next sibling;
            # that sibling is the next element in document order.
            parent = self
            parentsNextSibling = None
            while not parentsNextSibling:
                parentsNextSibling = parent.nextSibling
                parent = parent.parent
                if not parent: # This is the last element in the document.
                    break
            if parentsNextSibling:
                newChildsLastElement.next = parentsNextSibling
            else:
                newChildsLastElement.next = None
        else:
            nextChild = self.contents[position]
            newChild.nextSibling = nextChild
            if newChild.nextSibling:
                newChild.nextSibling.previousSibling = newChild
            newChildsLastElement.next = nextChild

        if newChildsLastElement.next:
            newChildsLastElement.next.previous = newChildsLastElement
        self.contents.insert(position, newChild)

    def append(self, tag):
        """Appends the given tag to the contents of this tag."""
        self.insert(len(self.contents), tag)

    def findNext(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears after this Tag in the document."""
        return self._findOne(self.findAllNext, name, attrs, text, **kwargs)

    def findAllNext(self, name=None, attrs={}, text=None, limit=None,
                    **kwargs):
        """Returns all items that match the given criteria and appear
        after this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.nextGenerator,
                             **kwargs)

    def findNextSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears after this Tag in the document."""
        return self._findOne(self.findNextSiblings, name, attrs, text,
                             **kwargs)

    def findNextSiblings(self, name=None, attrs={}, text=None, limit=None,
                         **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear after this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.nextSiblingGenerator, **kwargs)
    fetchNextSiblings = findNextSiblings # Compatibility with pre-3.x

    def findPrevious(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the first item that matches the given criteria and
        appears before this Tag in the document."""
        return self._findOne(self.findAllPrevious, name, attrs, text, **kwargs)

    def findAllPrevious(self, name=None, attrs={}, text=None, limit=None,
                        **kwargs):
        """Returns all items that match the given criteria and appear
        before this Tag in the document."""
        return self._findAll(name, attrs, text, limit, self.previousGenerator,
                             **kwargs)
    fetchPrevious = findAllPrevious # Compatibility with pre-3.x

    def findPreviousSibling(self, name=None, attrs={}, text=None, **kwargs):
        """Returns the closest sibling to this Tag that matches the
        given criteria and appears before this Tag in the document."""
        return self._findOne(self.findPreviousSiblings, name, attrs, text,
                             **kwargs)

    def findPreviousSiblings(self, name=None, attrs={}, text=None,
                             limit=None, **kwargs):
        """Returns the siblings of this Tag that match the given
        criteria and appear before this Tag in the document."""
        return self._findAll(name, attrs, text, limit,
                             self.previousSiblingGenerator, **kwargs)
    fetchPreviousSiblings = findPreviousSiblings # Compatibility with pre-3.x

    def findParent(self, name=None, attrs={}, **kwargs):
        """Returns the closest parent of this Tag that matches the given
        criteria."""
        # NOTE: We can't use _findOne because findParents takes a different
        # set of arguments.
        r = None
        l = self.findParents(name, attrs, 1)
        if l:
            r = l[0]
        return r

    def findParents(self, name=None, attrs={}, limit=None, **kwargs):
        """Returns the parents of this Tag that match the given
        criteria."""

        return self._findAll(name, attrs, None, limit, self.parentGenerator,
                             **kwargs)
    fetchParents = findParents # Compatibility with pre-3.x

    #These methods do the real heavy lifting.

    def _findOne(self, method, name, attrs, text, **kwargs):
        """Runs a findAll-style method with limit=1 and returns the
        single result, or None."""
        r = None
        l = method(name, attrs, text, 1, **kwargs)
        if l:
            r = l[0]
        return r

    def _findAll(self, name, attrs, text, limit, generator, **kwargs):
        "Iterates over a generator looking for things that match."

        if isinstance(name, SoupStrainer):
            strainer = name
        # (Possibly) special case some findAll*(...) searches
        elif text is None and not limit and not attrs and not kwargs:
            # findAll*(True)
            if name is True:
                return [element for element in generator()
                        if isinstance(element, Tag)]
            # findAll*('tag-name')
            elif isinstance(name, basestring):
                return [element for element in generator()
                        if isinstance(element, Tag) and
                        element.name == name]
            else:
                strainer = SoupStrainer(name, attrs, text, **kwargs)
        # Build a SoupStrainer
        else:
            strainer = SoupStrainer(name, attrs, text, **kwargs)
        results = ResultSet(strainer)
        g = generator()
        while True:
            try:
                i = g.next()
            except StopIteration:
                break
            if i:
                found = strainer.search(i)
                if found:
                    results.append(found)
                    if limit and len(results) >= limit:
                        break
        return results

    #These Generators can be used to navigate starting from both
    #NavigableStrings and Tags.
    def nextGenerator(self):
        "Yields each element after this one, in document order."
        i = self
        while i is not None:
            i = i.next
            yield i

    def nextSiblingGenerator(self):
        "Yields each following sibling of this element."
        i = self
        while i is not None:
            i = i.nextSibling
            yield i

    def previousGenerator(self):
        "Yields each element before this one, in reverse document order."
        i = self
        while i is not None:
            i = i.previous
            yield i

    def previousSiblingGenerator(self):
        "Yields each preceding sibling of this element."
        i = self
        while i is not None:
            i = i.previousSibling
            yield i

    def parentGenerator(self):
        "Yields each ancestor of this element, nearest first."
        i = self
        while i is not None:
            i = i.parent
            yield i

    # Utility methods
    def substituteEncoding(self, str, encoding=None):
        """Replaces the %SOUP-ENCODING% placeholder with the given
        encoding name (default "utf-8")."""
        encoding = encoding or "utf-8"
        return str.replace("%SOUP-ENCODING%", encoding)

    def toEncoding(self, s, encoding=None):
        """Encodes an object to a string in some encoding, or to
        Unicode if no encoding is given."""
        if isinstance(s, unicode):
            if encoding:
                s = s.encode(encoding)
        elif isinstance(s, str):
            if encoding:
                s = s.encode(encoding)
            else:
                s = unicode(s)
        else:
            if encoding:
                s  = self.toEncoding(str(s), encoding)
            else:
                s = unicode(s)
        return s

    # Matches a bare '<', '>', or an '&' that does not begin an entity
    # reference; used to escape text/attribute values on output.
    BARE_AMPERSAND_OR_BRACKET = re.compile(r"([<>]|"
                                           + r"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           + r")")

    def _sub_entity(self, x):
        """Used with a regular expression to substitute the
        appropriate XML entity for an XML special character."""
        return "&" + self.XML_SPECIAL_CHARS_TO_ENTITIES[x.group(0)[0]] + ";"
| 447 | |
| 448 | |
class NavigableString(unicode, PageElement):
    # A Unicode string that also carries PageElement navigation
    # pointers, so pieces of text participate fully in the parse tree.

    def __new__(cls, value):
        """Create a new NavigableString.

        When unpickling a NavigableString, this method is called with
        the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
        passed in to the superclass's __new__ or the superclass won't know
        how to handle non-ASCII characters.
        """
        if isinstance(value, unicode):
            return unicode.__new__(cls, value)
        return unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)

    def __getnewargs__(self):
        # Pickle support: reconstruct via __new__ with the encoded form.
        return (NavigableString.__str__(self),)

    def __getattr__(self, attr):
        """text.string gives you text. This is for backwards
        compatibility for Navigable*String, but for CData* it lets you
        get the string without the CData wrapper."""
        if attr == 'string':
            return self
        else:
            raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__.__name__, attr)

    def __unicode__(self):
        # Round-trip through __str__ so outgoing entity substitution
        # (see below) also applies to the Unicode rendering.
        return str(self).decode(DEFAULT_OUTPUT_ENCODING)

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Substitute outgoing XML entities.
        data = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, self)
        if encoding:
            return data.encode(encoding)
        else:
            return data
| 485 | |
class CData(NavigableString):
    """A CDATA section found in the document."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        # Render the (entity-substituted, encoded) text inside a
        # CDATA marker.
        rendered = NavigableString.__str__(self, encoding)
        return "<![CDATA[%s]]>" % rendered
| 490 | |
class ProcessingInstruction(NavigableString):
    """A processing instruction found in the document."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        text = self
        # Honor the encoding placeholder (e.g. in an XML declaration).
        if "%SOUP-ENCODING%" in text:
            text = self.substituteEncoding(text, encoding)
        return "<?%s?>" % self.toEncoding(text, encoding)
| 497 | |
class Comment(NavigableString):
    """An HTML/XML comment found in the document."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!--%s-->" % rendered
| 501 | |
class Declaration(NavigableString):
    """A declaration (such as a DOCTYPE) found in the document."""

    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        rendered = NavigableString.__str__(self, encoding)
        return "<!%s>" % rendered
| 505 | |
| 506 | class Tag(PageElement): |
| 507 | |
| 508 | """Represents a found HTML tag with its attributes and contents.""" |
| 509 | |
| 510 | def _convertEntities(self, match): |
| 511 | """Used in a call to re.sub to replace HTML, XML, and numeric |
| 512 | entities with the appropriate Unicode characters. If HTML |
| 513 | entities are being converted, any unrecognized entities are |
| 514 | escaped.""" |
| 515 | x = match.group(1) |
| 516 | if self.convertHTMLEntities and x in name2codepoint: |
| 517 | return unichr(name2codepoint[x]) |
| 518 | elif x in self.XML_ENTITIES_TO_SPECIAL_CHARS: |
| 519 | if self.convertXMLEntities: |
| 520 | return self.XML_ENTITIES_TO_SPECIAL_CHARS[x] |
| 521 | else: |
| 522 | return u'&%s;' % x |
| 523 | elif len(x) > 0 and x[0] == '#': |
| 524 | # Handle numeric entities |
| 525 | if len(x) > 1 and x[1] == 'x': |
| 526 | return unichr(int(x[2:], 16)) |
| 527 | else: |
| 528 | return unichr(int(x[1:])) |
| 529 | |
| 530 | elif self.escapeUnrecognizedEntities: |
| 531 | return u'&%s;' % x |
| 532 | else: |
| 533 | return u'&%s;' % x |
| 534 | |
    def __init__(self, parser, name, attrs=None, parent=None,
                 previous=None):
        "Basic constructor."

        # We don't actually store the parser object: that lets extracted
        # chunks be garbage-collected
        self.parserClass = parser.__class__
        self.isSelfClosing = parser.isSelfClosingTag(name)
        self.name = name
        # Normalize attrs to a list of (key, value) pairs; order matters
        # for round-tripping the document.
        if attrs is None:
            attrs = []
        elif isinstance(attrs, dict):
            attrs = attrs.items()
        self.attrs = attrs
        self.contents = []
        self.setup(parent, previous)
        self.hidden = False
        self.containsSubstitutions = False
        # Entity-handling policy is inherited from the parser at
        # construction time.
        self.convertHTMLEntities = parser.convertHTMLEntities
        self.convertXMLEntities = parser.convertXMLEntities
        self.escapeUnrecognizedEntities = parser.escapeUnrecognizedEntities

        # Convert any HTML, XML, or numeric entities in the attribute values.
        # NOTE: tuple-parameter lambda syntax is Python 2-only.
        convert = lambda(k, val): (k,
                                   re.sub("&(#\d+|#x[0-9a-fA-F]+|\w+);",
                                          self._convertEntities,
                                          val))
        self.attrs = map(convert, self.attrs)
| 563 | |
| 564 | def getString(self): |
| 565 | if (len(self.contents) == 1 |
| 566 | and isinstance(self.contents[0], NavigableString)): |
| 567 | return self.contents[0] |
| 568 | |
| 569 | def setString(self, string): |
| 570 | """Replace the contents of the tag with a string""" |
| 571 | self.clear() |
| 572 | self.append(string) |
| 573 | |
| 574 | string = property(getString, setString) |
| 575 | |
| 576 | def getText(self, separator=u""): |
| 577 | if not len(self.contents): |
| 578 | return u"" |
| 579 | stopNode = self._lastRecursiveChild().next |
| 580 | strings = [] |
| 581 | current = self.contents[0] |
| 582 | while current is not stopNode: |
| 583 | if isinstance(current, NavigableString): |
| 584 | strings.append(current.strip()) |
| 585 | current = current.next |
| 586 | return separator.join(strings) |
| 587 | |
| 588 | text = property(getText) |
| 589 | |
| 590 | def get(self, key, default=None): |
| 591 | """Returns the value of the 'key' attribute for the tag, or |
| 592 | the value given for 'default' if it doesn't have that |
| 593 | attribute.""" |
| 594 | return self._getAttrMap().get(key, default) |
| 595 | |
| 596 | def clear(self): |
| 597 | """Extract all children.""" |
| 598 | for child in self.contents[:]: |
| 599 | child.extract() |
| 600 | |
    def index(self, element):
        # Like list.index(), but compares by identity rather than
        # equality -- two distinct but equal tags must not be confused.
        for i, child in enumerate(self.contents):
            if child is element:
                return i
        raise ValueError("Tag.index: element not in tag")
| 606 | |
| 607 | def has_key(self, key): |
| 608 | return self._getAttrMap().has_key(key) |
| 609 | |
| 610 | def __getitem__(self, key): |
| 611 | """tag[key] returns the value of the 'key' attribute for the tag, |
| 612 | and throws an exception if it's not there.""" |
| 613 | return self._getAttrMap()[key] |
| 614 | |
| 615 | def __iter__(self): |
| 616 | "Iterating over a tag iterates over its contents." |
| 617 | return iter(self.contents) |
| 618 | |
| 619 | def __len__(self): |
| 620 | "The length of a tag is the length of its list of contents." |
| 621 | return len(self.contents) |
| 622 | |
| 623 | def __contains__(self, x): |
| 624 | return x in self.contents |
| 625 | |
| 626 | def __nonzero__(self): |
| 627 | "A tag is non-None even if it has no contents." |
| 628 | return True |
| 629 | |
| 630 | def __setitem__(self, key, value): |
| 631 | """Setting tag[key] sets the value of the 'key' attribute for the |
| 632 | tag.""" |
| 633 | self._getAttrMap() |
| 634 | self.attrMap[key] = value |
| 635 | found = False |
| 636 | for i in range(0, len(self.attrs)): |
| 637 | if self.attrs[i][0] == key: |
| 638 | self.attrs[i] = (key, value) |
| 639 | found = True |
| 640 | if not found: |
| 641 | self.attrs.append((key, value)) |
| 642 | self._getAttrMap()[key] = value |
| 643 | |
| 644 | def __delitem__(self, key): |
| 645 | "Deleting tag[key] deletes all 'key' attributes for the tag." |
| 646 | for item in self.attrs: |
| 647 | if item[0] == key: |
| 648 | self.attrs.remove(item) |
| 649 | #We don't break because bad HTML can define the same |
| 650 | #attribute multiple times. |
| 651 | self._getAttrMap() |
| 652 | if self.attrMap.has_key(key): |
| 653 | del self.attrMap[key] |
| 654 | |
| 655 | def __call__(self, *args, **kwargs): |
| 656 | """Calling a tag like a function is the same as calling its |
| 657 | findAll() method. Eg. tag('a') returns a list of all the A tags |
| 658 | found within this tag.""" |
| 659 | return apply(self.findAll, args, kwargs) |
| 660 | |
    def __getattr__(self, tag):
        # Attribute access doubles as a search: soup.fooTag and
        # soup.foo both mean soup.find('foo'). Dunder names are
        # excluded so protocols (copy, pickle, etc.) work normally.
        #print "Getattr %s.%s" % (self.__class__, tag)
        if len(tag) > 3 and tag.rfind('Tag') == len(tag)-3:
            return self.find(tag[:-3])
        elif tag.find('__') != 0:
            return self.find(tag)
        raise AttributeError, "'%s' object has no attribute '%s'" % (self.__class__, tag)
| 668 | |
| 669 | def __eq__(self, other): |
| 670 | """Returns true iff this tag has the same name, the same attributes, |
| 671 | and the same contents (recursively) as the given tag. |
| 672 | |
| 673 | NOTE: right now this will return false if two tags have the |
| 674 | same attributes in a different order. Should this be fixed?""" |
| 675 | if other is self: |
| 676 | return True |
| 677 | if not hasattr(other, 'name') or not hasattr(other, 'attrs') or not hasattr(other, 'contents') or self.name != other.name or self.attrs != other.attrs or len(self) != len(other): |
| 678 | return False |
| 679 | for i in range(0, len(self.contents)): |
| 680 | if self.contents[i] != other.contents[i]: |
| 681 | return False |
| 682 | return True |
| 683 | |
| 684 | def __ne__(self, other): |
| 685 | """Returns true iff this tag is not identical to the other tag, |
| 686 | as defined in __eq__.""" |
| 687 | return not self == other |
| 688 | |
    def __repr__(self, encoding=DEFAULT_OUTPUT_ENCODING):
        """Renders this tag as a string."""
        # repr() of a tag is simply its serialized markup.
        return self.__str__(encoding)

    def __unicode__(self):
        # Passing encoding=None makes __str__ return Unicode.
        return self.__str__(None)
| 695 | |
    def __str__(self, encoding=DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0):
        """Returns a string or Unicode representation of this tag and
        its contents. To get Unicode, pass None for encoding.

        NOTE: since Python's HTML parser consumes whitespace, this
        method is not certain to reproduce the whitespace present in
        the original string."""

        encodedName = self.toEncoding(self.name, encoding)

        # Render the attributes as 'key="value"' strings, choosing
        # quoting per-attribute and escaping specials.
        attrs = []
        if self.attrs:
            for key, val in self.attrs:
                fmt = '%s="%s"'
                if isinstance(val, basestring):
                    if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                        val = self.substituteEncoding(val, encoding)

                    # The attribute value either:
                    #
                    # * Contains no embedded double quotes or single quotes.
                    #   No problem: we enclose it in double quotes.
                    # * Contains embedded single quotes. No problem:
                    #   double quotes work here too.
                    # * Contains embedded double quotes. No problem:
                    #   we enclose it in single quotes.
                    # * Embeds both single _and_ double quotes. This
                    #   can't happen naturally, but it can happen if
                    #   you modify an attribute value after parsing
                    #   the document. Now we have a bit of a
                    #   problem. We solve it by enclosing the
                    #   attribute in single quotes, and escaping any
                    #   embedded single quotes to XML entities.
                    if '"' in val:
                        fmt = "%s='%s'"
                        if "'" in val:
                            # TODO: replace with apos when
                            # appropriate.
                            val = val.replace("'", "&squot;")

                    # Now we're okay w/r/t quotes. But the attribute
                    # value might also contain angle brackets, or
                    # ampersands that aren't part of entities. We need
                    # to escape those to XML entities too.
                    val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                attrs.append(fmt % (self.toEncoding(key, encoding),
                                    self.toEncoding(val, encoding)))
        close = ''
        closeTag = ''
        if self.isSelfClosing:
            close = ' /'
        else:
            closeTag = '</%s>' % encodedName

        indentTag, indentContents = 0, 0
        if prettyPrint:
            indentTag = indentLevel
            space = (' ' * (indentTag-1))
            indentContents = indentTag + 1
        contents = self.renderContents(encoding, prettyPrint, indentContents)
        if self.hidden:
            # Hidden tags (e.g. the soup object itself) render only
            # their contents, with no enclosing markup.
            s = contents
        else:
            s = []
            attributeString = ''
            if attrs:
                attributeString = ' ' + ' '.join(attrs)
            if prettyPrint:
                s.append(space)
            s.append('<%s%s%s>' % (encodedName, attributeString, close))
            if prettyPrint:
                s.append("\n")
            s.append(contents)
            if prettyPrint and contents and contents[-1] != "\n":
                s.append("\n")
            if prettyPrint and closeTag:
                s.append(space)
            s.append(closeTag)
            if prettyPrint and closeTag and self.nextSibling:
                s.append("\n")
            s = ''.join(s)
        return s
| 780 | |
    def decompose(self):
        """Recursively destroys the contents of this tree."""
        self.extract()
        if len(self.contents) == 0:
            return
        # Walk the flattened document-order chain, severing every
        # reference so the whole subtree can be garbage-collected.
        current = self.contents[0]
        while current is not None:
            # Grab the next link before we null it out.
            next = current.next
            if isinstance(current, Tag):
                del current.contents[:]
            current.parent = None
            current.previous = None
            current.previousSibling = None
            current.next = None
            current.nextSibling = None
            current = next
| 797 | |
| 798 | def prettify(self, encoding=DEFAULT_OUTPUT_ENCODING): |
| 799 | return self.__str__(encoding, True) |
| 800 | |
    def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                       prettyPrint=False, indentLevel=0):
        """Renders the contents of this tag as a string in the given
        encoding. If encoding is None, returns a Unicode string.."""
        s=[]
        for c in self:
            text = None
            if isinstance(c, NavigableString):
                text = c.__str__(encoding)
            elif isinstance(c, Tag):
                # Child tags render themselves (recursively).
                s.append(c.__str__(encoding, prettyPrint, indentLevel))
            if text and prettyPrint:
                text = text.strip()
            if text:
                if prettyPrint:
                    s.append(" " * (indentLevel-1))
                s.append(text)
                if prettyPrint:
                    s.append("\n")
        return ''.join(s)
| 821 | |
| 822 | #Soup methods |
| 823 | |
| 824 | def find(self, name=None, attrs={}, recursive=True, text=None, |
| 825 | **kwargs): |
| 826 | """Return only the first child of this Tag matching the given |
| 827 | criteria.""" |
| 828 | r = None |
| 829 | l = self.findAll(name, attrs, recursive, text, 1, **kwargs) |
| 830 | if l: |
| 831 | r = l[0] |
| 832 | return r |
| 833 | findChild = find |
| 834 | |
| 835 | def findAll(self, name=None, attrs={}, recursive=True, text=None, |
| 836 | limit=None, **kwargs): |
| 837 | """Extracts a list of Tag objects that match the given |
| 838 | criteria. You can specify the name of the Tag and any |
| 839 | attributes you want the Tag to have. |
| 840 | |
| 841 | The value of a key-value pair in the 'attrs' map can be a |
| 842 | string, a list of strings, a regular expression object, or a |
| 843 | callable that takes a string and returns whether or not the |
| 844 | string matches for some custom definition of 'matches'. The |
| 845 | same is true of the tag name.""" |
| 846 | generator = self.recursiveChildGenerator |
| 847 | if not recursive: |
| 848 | generator = self.childGenerator |
| 849 | return self._findAll(name, attrs, text, limit, generator, **kwargs) |
| 850 | findChildren = findAll |
| 851 | |
    # Pre-3.x compatibility aliases for find() and findAll().
    first = find
    fetch = findAll
| 855 | |
| 856 | def fetchText(self, text=None, recursive=True, limit=None): |
| 857 | return self.findAll(text=text, recursive=recursive, limit=limit) |
| 858 | |
| 859 | def firstText(self, text=None, recursive=True): |
| 860 | return self.find(text=text, recursive=recursive) |
| 861 | |
| 862 | #Private methods |
| 863 | |
    def _getAttrMap(self):
        """Initializes a map representation of this tag's attributes,
        if not already initialized.

        self.attrs is a list of (key, value) pairs; self.attrMap caches
        the same data as a dict for constant-time lookup."""
        # NOTE(review): getattr() with no default relies on the lookup
        # returning a false value when attrMap was never assigned --
        # presumably Tag.__getattr__ (not visible here) absorbs the
        # miss rather than raising; confirm.
        if not getattr(self, 'attrMap'):
            self.attrMap = {}
            for (key, value) in self.attrs:
                self.attrMap[key] = value
        return self.attrMap
| 872 | |
| 873 | #Generator methods |
    def childGenerator(self):
        """Returns an iterator over this tag's direct children only
        (no recursion into grandchildren)."""
        # Just use the iterator from the contents
        return iter(self.contents)
| 877 | |
| 878 | def recursiveChildGenerator(self): |
| 879 | if not len(self.contents): |
| 880 | raise StopIteration |
| 881 | stopNode = self._lastRecursiveChild().next |
| 882 | current = self.contents[0] |
| 883 | while current is not stopNode: |
| 884 | yield current |
| 885 | current = current.next |
| 886 | |
| 887 | |
| 888 | # Next, a couple classes to represent queries and their results. |
class SoupStrainer:
    """Encapsulates a number of ways of matching a markup element (tag
    or text).

    The criteria (name, attrs, text, keyword arguments) mirror those of
    Tag.findAll(); each criterion may be a string, a callable, a
    regular expression object, a list, or True."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        self.name = name
        # A bare string for 'attrs' is shorthand for matching on the
        # CSS class attribute.
        if isinstance(attrs, basestring):
            kwargs['class'] = _match_css_class(attrs)
            attrs = None
        if kwargs:
            if attrs:
                # Copy before merging so the caller's dict isn't
                # mutated.
                attrs = attrs.copy()
                attrs.update(kwargs)
            else:
                attrs = kwargs
        self.attrs = attrs
        self.text = text

    def __str__(self):
        if self.text:
            return self.text
        else:
            return "%s|%s" % (self.name, self.attrs)

    def searchTag(self, markupName=None, markupAttrs={}):
        """Checks a tag -- given either as a Tag object, or as a name
        plus a list of attribute pairs -- against this strainer's name
        and attribute criteria. Returns the Tag (or the name) on a
        match, None otherwise."""
        found = None
        markup = None
        if isinstance(markupName, Tag):
            markup = markupName
            # The Tag doubles as its own attribute map -- presumably
            # Tag supports dict-style get(); confirm against Tag's
            # definition.
            markupAttrs = markup
        # A callable name criterion is invoked with (name, attrs), but
        # only when we were handed a raw name rather than a Tag.
        callFunctionWithTagData = callable(self.name) \
                                and not isinstance(markupName, Tag)

        if (not self.name) \
               or callFunctionWithTagData \
               or (markup and self._matches(markup, self.name)) \
               or (not markup and self._matches(markupName, self.name)):
            if callFunctionWithTagData:
                match = self.name(markupName, markupAttrs)
            else:
                match = True
                # Lazily build a dict view of the attributes the first
                # time an attribute criterion needs one.
                markupAttrMap = None
                for attr, matchAgainst in self.attrs.items():
                    if not markupAttrMap:
                         if hasattr(markupAttrs, 'get'):
                            markupAttrMap = markupAttrs
                         else:
                            markupAttrMap = {}
                            for k,v in markupAttrs:
                                markupAttrMap[k] = v
                    attrValue = markupAttrMap.get(attr)
                    if not self._matches(attrValue, matchAgainst):
                        match = False
                        break
            if match:
                if markup:
                    found = markup
                else:
                    found = markupName
        return found

    def search(self, markup):
        """Runs this strainer against a piece of markup (a Tag, a
        string, or a list of either) and returns the matching element,
        or None."""
        #print 'looking for %s in %s' % (self, markup)
        found = None
        # If given a list of items, scan it for a text element that
        # matches.
        if hasattr(markup, "__iter__") \
                and not isinstance(markup, Tag):
            for element in markup:
                if isinstance(element, NavigableString) \
                       and self.search(element):
                    found = element
                    break
        # If it's a Tag, make sure its name or attributes match.
        # Don't bother with Tags if we're searching for text.
        elif isinstance(markup, Tag):
            if not self.text:
                found = self.searchTag(markup)
        # If it's text, make sure the text matches.
        elif isinstance(markup, NavigableString) or \
                 isinstance(markup, basestring):
            if self._matches(markup, self.text):
                found = markup
        else:
            raise Exception, "I don't know how to match against a %s" \
                  % markup.__class__
        return found

    def _matches(self, markup, matchAgainst):
        """Returns whether one piece of markup (a tag, a tag name, an
        attribute value, or a text string) satisfies one criterion.
        The criterion may be True (match anything non-None), a
        callable, a regexp object, a list, or a string."""
        #print "Matching %s against %s" % (markup, matchAgainst)
        result = False
        if matchAgainst is True:
            result = markup is not None
        elif callable(matchAgainst):
            result = matchAgainst(markup)
        else:
            #Custom match methods take the tag as an argument, but all
            #other ways of matching match the tag name as a string.
            if isinstance(markup, Tag):
                markup = markup.name
            if markup and not isinstance(markup, basestring):
                markup = unicode(markup)
            #Now we know that chunk is either a string, or None.
            if hasattr(matchAgainst, 'match'):
                # It's a regexp object.
                result = markup and matchAgainst.search(markup)
            elif hasattr(matchAgainst, '__iter__'): # list-like
                result = markup in matchAgainst
            elif hasattr(matchAgainst, 'items'):
                # NOTE(review): markup is a string or None here, and
                # strings have no has_key(); dicts are also caught by
                # the '__iter__' branch above first, so this branch
                # looks unreachable/broken -- confirm before relying
                # on dict criteria.
                result = markup.has_key(matchAgainst)
            elif matchAgainst and isinstance(markup, basestring):
                # Coerce the criterion to the markup's string type so
                # the equality test below compares like with like.
                if isinstance(markup, unicode):
                    matchAgainst = unicode(matchAgainst)
                else:
                    matchAgainst = str(matchAgainst)

            if not result:
                result = matchAgainst == markup
        return result
| 1008 | |
class ResultSet(list):
    """A ResultSet is just a list that keeps track of the SoupStrainer
    that created it.

    The 'source' attribute holds the SoupStrainer whose search
    produced this list."""
    def __init__(self, source):
        # Bug fix: the original called list.__init__([]), which
        # initialized a throwaway list literal rather than this
        # instance. Passing self is the intended call; observable
        # behavior is unchanged because a new list is empty anyway.
        list.__init__(self)
        self.source = source
| 1015 | |
| 1016 | # Now, some helper functions. |
| 1017 | |
def buildTagMap(default, *args):
    """Turns any mixture of maps, lists, and scalars into one map.

    Map arguments contribute their own key/value pairs. Every element
    of a list argument, and every scalar argument, becomes a key mapped
    to 'default'. Used to build the SELF_CLOSING_TAGS, NESTABLE_TAGS,
    and NESTING_RESET_TAGS maps out of lists and partial maps."""
    built = {}
    for portion in args:
        if hasattr(portion, 'items'):
            # A map: merge its pairs in as-is.
            built.update(portion.items())
        elif hasattr(portion, '__iter__'):
            # A list: every element maps to the default.
            for key in portion:
                built[key] = default
        else:
            # A scalar: it maps to the default.
            built[portion] = default
    return built
| 1036 | |
| 1037 | # Now, the parser classes. |
| 1038 | |
class BeautifulStoneSoup(Tag, SGMLParser):

    """This class contains the basic parser and search code. It defines
    a parser that knows nothing about tag behavior except for the
    following:

      You can't close a tag without closing all the tags it encloses.
      That is, "<foo><bar></foo>" actually means
      "<foo><bar></bar></foo>".

    [Another possible explanation is "<foo><bar /></foo>", but since
    this class defines no SELF_CLOSING_TAGS, it will never use that
    explanation.]

    This class is useful for parsing XML or made-up markup languages,
    or when BeautifulSoup makes an assumption counter to what you were
    expecting."""

    # Tag-behavior tables. They are empty here because this base parser
    # assumes nothing about any particular tag; subclasses (e.g.
    # BeautifulSoup below) override them with language-specific rules.
    SELF_CLOSING_TAGS = {}
    NESTABLE_TAGS = {}
    RESET_NESTING_TAGS = {}
    QUOTE_TAGS = {}
    PRESERVE_WHITESPACE_TAGS = []

    # Default (regex, replacement) pairs run over the markup before
    # parsing; they fix the two sgmllib-killers described in __init__:
    # "<br/>" (no space before the tag close) and "<! --Comment-->"
    # (stray whitespace in a declaration).
    MARKUP_MASSAGE = [(re.compile('(<[^<>]*)/>'),
                       lambda x: x.group(1) + ' />'),
                      (re.compile('<!\s+([^<>]*)>'),
                       lambda x: '<!' + x.group(1) + '>')
                      ]

    # Name of the synthetic root tag that holds the whole parse tree.
    ROOT_TAG_NAME = u'[document]'

    # Entity-conversion modes accepted by the convertEntities argument.
    HTML_ENTITIES = "html"
    XML_ENTITIES = "xml"
    XHTML_ENTITIES = "xhtml"
    # TODO: This only exists for backwards-compatibility
    ALL_ENTITIES = XHTML_ENTITIES

    # Used when determining whether a text node is all whitespace and
    # can be replaced with a single space. A text node that contains
    # fancy Unicode spaces (usually non-breaking) should be left
    # alone. Used as a unicode.translate() deletion table in endData().
    STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, }

    def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None,
                 markupMassage=True, smartQuotesTo=XML_ENTITIES,
                 convertEntities=None, selfClosingTags=None, isHTML=False):
        """The Soup object is initialized as the 'root tag', and the
        provided markup (which can be a string or a file-like object)
        is fed into the underlying parser.

        sgmllib will process most bad HTML, and the BeautifulSoup
        class has some tricks for dealing with some HTML that kills
        sgmllib, but Beautiful Soup can nonetheless choke or lose data
        if your data uses self-closing tags or declarations
        incorrectly.

        By default, Beautiful Soup uses regexes to sanitize input,
        avoiding the vast majority of these problems. If the problems
        don't apply to you, pass in False for markupMassage, and
        you'll get better performance.

        The default parser massage techniques fix the two most common
        instances of invalid HTML that choke sgmllib:

         <br/> (No space between name of closing tag and tag close)
         <! --Comment--> (Extraneous whitespace in declaration)

        You can pass in a custom list of (RE object, replace method)
        tuples to get Beautiful Soup to scrub your input the way you
        want."""

        self.parseOnlyThese = parseOnlyThese
        self.fromEncoding = fromEncoding
        self.smartQuotesTo = smartQuotesTo
        self.convertEntities = convertEntities
        # Set the rules for how we'll deal with the entities we
        # encounter
        if self.convertEntities:
            # It doesn't make sense to convert encoded characters to
            # entities even while you're converting entities to Unicode.
            # Just convert it all to Unicode.
            self.smartQuotesTo = None
            if convertEntities == self.HTML_ENTITIES:
                self.convertXMLEntities = False
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = True
            elif convertEntities == self.XHTML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = True
                self.escapeUnrecognizedEntities = False
            elif convertEntities == self.XML_ENTITIES:
                self.convertXMLEntities = True
                self.convertHTMLEntities = False
                self.escapeUnrecognizedEntities = False
        else:
            self.convertXMLEntities = False
            self.convertHTMLEntities = False
            self.escapeUnrecognizedEntities = False

        # Tags the caller wants treated as self-closing, on top of the
        # class-level SELF_CLOSING_TAGS.
        self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags)
        SGMLParser.__init__(self)

        if hasattr(markup, 'read'):        # It's a file-type object.
            markup = markup.read()
        self.markup = markup
        self.markupMassage = markupMassage
        try:
            self._feed(isHTML=isHTML)
        except StopParsing:
            # Raised by start_meta() after re-feeding the document with
            # a newly discovered encoding; the tree is already built.
            pass
        self.markup = None                 # The markup can now be GCed

    def convert_charref(self, name):
        """This method fixes a bug in Python's SGMLParser."""
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127 : # ASCII ends at 127, not 255
            return
        return self.convert_codepoint(n)

    def _feed(self, inDocumentEncoding=None, isHTML=False):
        """Converts the stored markup to Unicode, optionally massages
        it, and runs it through the SGML parser to build the tree.
        Called again by start_meta() when an in-document encoding is
        discovered."""
        # Convert the document to Unicode.
        markup = self.markup
        if isinstance(markup, unicode):
            if not hasattr(self, 'originalEncoding'):
                self.originalEncoding = None
        else:
            dammit = UnicodeDammit\
                     (markup, [self.fromEncoding, inDocumentEncoding],
                      smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
            markup = dammit.unicode
            self.originalEncoding = dammit.originalEncoding
            self.declaredHTMLEncoding = dammit.declaredHTMLEncoding
        if markup:
            if self.markupMassage:
                # markupMassage=True selects the default fix list; an
                # iterable is used as a custom list of fixes.
                if not hasattr(self.markupMassage, "__iter__"):
                    self.markupMassage = self.MARKUP_MASSAGE
                for fix, m in self.markupMassage:
                    markup = fix.sub(m, markup)
                # TODO: We get rid of markupMassage so that the
                # soup object can be deepcopied later on. Some
                # Python installations can't copy regexes. If anyone
                # was relying on the existence of markupMassage, this
                # might cause problems.
                del(self.markupMassage)
        self.reset()

        SGMLParser.feed(self, markup)
        # Close out any unfinished strings and close all the open tags.
        self.endData()
        while self.currentTag.name != self.ROOT_TAG_NAME:
            self.popTag()

    def __getattr__(self, methodName):
        """This method routes method call requests to either the SGMLParser
        superclass or the Tag superclass, depending on the method name."""
        #print "__getattr__ called on %s.%s" % (self.__class__, methodName)

        if methodName.startswith('start_') or methodName.startswith('end_') \
               or methodName.startswith('do_'):
            return SGMLParser.__getattr__(self, methodName)
        elif not methodName.startswith('__'):
            return Tag.__getattr__(self, methodName)
        else:
            raise AttributeError

    def isSelfClosingTag(self, name):
        """Returns true iff the given string is the name of a
        self-closing tag according to this parser."""
        return self.SELF_CLOSING_TAGS.has_key(name) \
               or self.instanceSelfClosingTags.has_key(name)

    def reset(self):
        """Resets all parser state and re-initializes this object as
        the (hidden) root tag, ready for a parse or a re-parse."""
        Tag.__init__(self, self, self.ROOT_TAG_NAME)
        # The root tag is hidden so it doesn't render itself in output.
        self.hidden = 1
        SGMLParser.reset(self)
        self.currentData = []
        self.currentTag = None
        self.tagStack = []
        self.quoteStack = []
        self.pushTag(self)

    def popTag(self):
        """Pops the top tag off the stack; the new stack top becomes
        the current tag."""
        tag = self.tagStack.pop()

        #print "Pop", tag.name
        if self.tagStack:
            self.currentTag = self.tagStack[-1]
        return self.currentTag

    def pushTag(self, tag):
        """Appends the tag to the current tag's contents and makes it
        the new current tag."""
        #print "Push", tag.name
        if self.currentTag:
            self.currentTag.contents.append(tag)
        self.tagStack.append(tag)
        self.currentTag = self.tagStack[-1]

    def endData(self, containerClass=NavigableString):
        """Flushes the buffered text in self.currentData into the tree
        as an instance of containerClass, collapsing all-ASCII-space
        runs to a single space or newline unless we're inside a
        PRESERVE_WHITESPACE_TAGS element."""
        if self.currentData:
            currentData = u''.join(self.currentData)
            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
                not set([tag.name for tag in self.tagStack]).intersection(
                    self.PRESERVE_WHITESPACE_TAGS)):
                if '\n' in currentData:
                    currentData = '\n'
                else:
                    currentData = ' '
            self.currentData = []
            # When parsing selectively, top-level text that doesn't
            # match the strainer is dropped.
            if self.parseOnlyThese and len(self.tagStack) <= 1 and \
                   (not self.parseOnlyThese.text or \
                    not self.parseOnlyThese.search(currentData)):
                return
            o = containerClass(currentData)
            o.setup(self.currentTag, self.previous)
            if self.previous:
                self.previous.next = o
            self.previous = o
            self.currentTag.contents.append(o)


    def _popToTag(self, name, inclusivePop=True):
        """Pops the tag stack up to and including the most recent
        instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
        the given tag."""
        #print "Popping to %s" % name
        if name == self.ROOT_TAG_NAME:
            # The root tag is never popped.
            return

        numPops = 0
        mostRecentTag = None
        for i in range(len(self.tagStack)-1, 0, -1):
            if name == self.tagStack[i].name:
                numPops = len(self.tagStack)-i
                break
        if not inclusivePop:
            numPops = numPops - 1

        for i in range(0, numPops):
            mostRecentTag = self.popTag()
        return mostRecentTag

    def _smartPop(self, name):

        """We need to pop up to the previous tag of this type, unless
        one of this tag's nesting reset triggers comes between this
        tag and the previous tag of this type, OR unless this tag is a
        generic nesting trigger and another generic nesting trigger
        comes between this tag and the previous tag of this type.

        Examples:
         <p>Foo<b>Bar *<p>* should pop to 'p', not 'b'.
         <p>Foo<table>Bar *<p>* should pop to 'table', not 'p'.
         <p>Foo<table><tr>Bar *<p>* should pop to 'tr', not 'p'.

         <li><ul><li> *<li>* should pop to 'ul', not the first 'li'.
         <tr><table><tr> *<tr>* should pop to 'table', not the first 'tr'
         <td><tr><td> *<td>* should pop to 'tr', not the first 'td'
        """

        nestingResetTriggers = self.NESTABLE_TAGS.get(name)
        isNestable = nestingResetTriggers != None
        isResetNesting = self.RESET_NESTING_TAGS.has_key(name)
        popTo = None
        inclusive = True
        for i in range(len(self.tagStack)-1, 0, -1):
            p = self.tagStack[i]
            if (not p or p.name == name) and not isNestable:
                #Non-nestable tags get popped to the top or to their
                #last occurance.
                popTo = name
                break
            if (nestingResetTriggers is not None
                and p.name in nestingResetTriggers) \
                or (nestingResetTriggers is None and isResetNesting
                    and self.RESET_NESTING_TAGS.has_key(p.name)):

                #If we encounter one of the nesting reset triggers
                #peculiar to this tag, or we encounter another tag
                #that causes nesting to reset, pop up to but not
                #including that tag.
                popTo = p.name
                inclusive = False
                break
            # NOTE(review): this assignment is overwritten by
            # p = self.tagStack[i] at the top of the next iteration,
            # so it appears to have no effect.
            p = p.parent
        if popTo:
            self._popToTag(popTo, inclusive)

    def unknown_starttag(self, name, attrs, selfClosing=0):
        """Handles an open tag: flushes pending text, pops enclosing
        tags per the nesting rules, then pushes a new Tag (popping it
        right back off if the tag is self-closing)."""
        #print "Start tag %s: %s" % (name, attrs)
        if self.quoteStack:
            #This is not a real tag.
            #print "<%s> is not real!" % name
            attrs = ''.join([' %s="%s"' % (x, y) for x, y in attrs])
            self.handle_data('<%s%s>' % (name, attrs))
            return
        self.endData()

        if not self.isSelfClosingTag(name) and not selfClosing:
            self._smartPop(name)

        # When parsing selectively, skip top-level tags that don't
        # match the strainer.
        if self.parseOnlyThese and len(self.tagStack) <= 1 \
               and (self.parseOnlyThese.text or not self.parseOnlyThese.searchTag(name, attrs)):
            return

        tag = Tag(self, name, attrs, self.currentTag, self.previous)
        if self.previous:
            self.previous.next = tag
        self.previous = tag
        self.pushTag(tag)
        if selfClosing or self.isSelfClosingTag(name):
            self.popTag()
        if name in self.QUOTE_TAGS:
            #print "Beginning quote (%s)" % name
            self.quoteStack.append(name)
            self.literal = 1
        return tag

    def unknown_endtag(self, name):
        """Handles a close tag by popping the stack up to the matching
        open tag -- unless we're inside a quoted (literal) section, in
        which case the text is kept verbatim."""
        #print "End tag %s" % name
        if self.quoteStack and self.quoteStack[-1] != name:
            #This is not a real end tag.
            #print "</%s> is not real!" % name
            self.handle_data('</%s>' % name)
            return
        self.endData()
        self._popToTag(name)
        if self.quoteStack and self.quoteStack[-1] == name:
            self.quoteStack.pop()
            self.literal = (len(self.quoteStack) > 0)

    def handle_data(self, data):
        """Buffers a chunk of text until endData() flushes it into the
        tree."""
        self.currentData.append(data)

    def _toStringSubclass(self, text, subclass):
        """Adds a certain piece of text to the tree as a NavigableString
        subclass."""
        self.endData()
        self.handle_data(text)
        self.endData(subclass)

    def handle_pi(self, text):
        """Handle a processing instruction as a ProcessingInstruction
        object, possibly one with a %SOUP-ENCODING% slot into which an
        encoding will be plugged later."""
        if text[:3] == "xml":
            text = u"xml version='1.0' encoding='%SOUP-ENCODING%'"
        self._toStringSubclass(text, ProcessingInstruction)

    def handle_comment(self, text):
        "Handle comments as Comment objects."
        self._toStringSubclass(text, Comment)

    def handle_charref(self, ref):
        "Handle character references as data."
        if self.convertEntities:
            data = unichr(int(ref))
        else:
            data = '&#%s;' % ref
        self.handle_data(data)

    def handle_entityref(self, ref):
        """Handle entity references as data, possibly converting known
        HTML and/or XML entity references to the corresponding Unicode
        characters."""
        data = None
        if self.convertHTMLEntities:
            try:
                data = unichr(name2codepoint[ref])
            except KeyError:
                pass

        if not data and self.convertXMLEntities:
                data = self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref)

        if not data and self.convertHTMLEntities and \
            not self.XML_ENTITIES_TO_SPECIAL_CHARS.get(ref):
                # TODO: We've got a problem here. We're told this is
                # an entity reference, but it's not an XML entity
                # reference or an HTML entity reference. Nonetheless,
                # the logical thing to do is to pass it through as an
                # unrecognized entity reference.
                #
                # Except: when the input is "&carol;" this function
                # will be called with input "carol". When the input is
                # "AT&T", this function will be called with input
                # "T". We have no way of knowing whether a semicolon
                # was present originally, so we don't know whether
                # this is an unknown entity or just a misplaced
                # ampersand.
                #
                # The more common case is a misplaced ampersand, so I
                # escape the ampersand and omit the trailing semicolon.
                data = "&amp;%s" % ref
        if not data:
            # This case is different from the one above, because we
            # haven't already gone through a supposedly comprehensive
            # mapping of entities to Unicode characters. We might not
            # have gone through any mapping at all. So the chances are
            # very high that this is a real entity, and not a
            # misplaced ampersand.
            data = "&%s;" % ref
        self.handle_data(data)

    def handle_decl(self, data):
        "Handle DOCTYPEs and the like as Declaration objects."
        self._toStringSubclass(data, Declaration)

    def parse_declaration(self, i):
        """Treat a bogus SGML declaration as raw data. Treat a CDATA
        declaration as a CData object."""
        j = None
        if self.rawdata[i:i+9] == '<![CDATA[':
            # Find the end of the CDATA section ourselves; sgmllib
            # would mangle it.
            k = self.rawdata.find(']]>', i)
            if k == -1:
                k = len(self.rawdata)
            data = self.rawdata[i+9:k]
            j = k+3
            self._toStringSubclass(data, CData)
        else:
            try:
                j = SGMLParser.parse_declaration(self, i)
            except SGMLParseError:
                # Unparseable declaration: keep it as literal text.
                toHandle = self.rawdata[i:]
                self.handle_data(toHandle)
                j = i + len(toHandle)
        return j
| 1469 | |
class BeautifulSoup(BeautifulStoneSoup):

    """This parser knows the following facts about HTML:

    * Some tags have no closing tag and should be interpreted as being
      closed as soon as they are encountered.

    * The text inside some tags (ie. 'script') may contain tags which
      are not really part of the document and which should be parsed
      as text, not tags. If you want to parse the text as tags, you can
      always fetch it and parse it explicitly.

    * Tag nesting rules:

      Most tags can't be nested at all. For instance, the occurance of
      a <p> tag should implicitly close the previous <p> tag.

       <p>Para1<p>Para2
        should be transformed into:
       <p>Para1</p><p>Para2

      Some tags can be nested arbitrarily. For instance, the occurance
      of a <blockquote> tag should _not_ implicitly close the previous
      <blockquote> tag.

       Alice said: <blockquote>Bob said: <blockquote>Blah
        should NOT be transformed into:
       Alice said: <blockquote>Bob said: </blockquote><blockquote>Blah

      Some tags can be nested, but the nesting is reset by the
      interposition of other tags. For instance, a <tr> tag should
      implicitly close the previous <tr> tag within the same <table>,
      but not close a <tr> tag in another table.

       <table><tr>Blah<tr>Blah
        should be transformed into:
       <table><tr>Blah</tr><tr>Blah
        but,
       <tr>Blah<table><tr>Blah
        should NOT be transformed into
       <tr>Blah<table></tr><tr>Blah

    Differing assumptions about tag nesting rules are a major source
    of problems with the BeautifulSoup class. If BeautifulSoup is not
    treating as nestable a tag your page author treats as nestable,
    try ICantBelieveItsBeautifulSoup, MinimalSoup, or
    BeautifulStoneSoup before writing your own subclass."""

    def __init__(self, *args, **kwargs):
        """Same as BeautifulStoneSoup's constructor, except that smart
        quotes are converted to HTML entities by default and the
        markup is flagged as HTML for encoding detection."""
        if not kwargs.has_key('smartQuotesTo'):
            kwargs['smartQuotesTo'] = self.HTML_ENTITIES
        kwargs['isHTML'] = True
        BeautifulStoneSoup.__init__(self, *args, **kwargs)

    SELF_CLOSING_TAGS = buildTagMap(None,
                                    ('br' , 'hr', 'input', 'img', 'meta',
                                    'spacer', 'link', 'frame', 'base', 'col'))

    PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])

    QUOTE_TAGS = {'script' : None, 'textarea' : None}

    #According to the HTML standard, each of these inline tags can
    #contain another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_INLINE_TAGS = ('span', 'font', 'q', 'object', 'bdo', 'sub', 'sup',
                            'center')

    #According to the HTML standard, these block tags can contain
    #another tag of the same type. Furthermore, it's common
    #to actually use these tags this way.
    NESTABLE_BLOCK_TAGS = ('blockquote', 'div', 'fieldset', 'ins', 'del')

    #Lists can contain other lists, but there are restrictions.
    NESTABLE_LIST_TAGS = { 'ol' : [],
                           'ul' : [],
                           'li' : ['ul', 'ol'],
                           'dl' : [],
                           'dd' : ['dl'],
                           'dt' : ['dl'] }

    #Tables can contain other tables, but there are restrictions.
    NESTABLE_TABLE_TAGS = {'table' : [],
                           'tr' : ['table', 'tbody', 'tfoot', 'thead'],
                           'td' : ['tr'],
                           'th' : ['tr'],
                           'thead' : ['table'],
                           'tbody' : ['table'],
                           'tfoot' : ['table'],
                           }

    NON_NESTABLE_BLOCK_TAGS = ('address', 'form', 'p', 'pre')

    #If one of these tags is encountered, all tags up to the next tag of
    #this type are popped.
    RESET_NESTING_TAGS = buildTagMap(None, NESTABLE_BLOCK_TAGS, 'noscript',
                                     NON_NESTABLE_BLOCK_TAGS,
                                     NESTABLE_LIST_TAGS,
                                     NESTABLE_TABLE_TAGS)

    NESTABLE_TAGS = buildTagMap([], NESTABLE_INLINE_TAGS, NESTABLE_BLOCK_TAGS,
                                NESTABLE_LIST_TAGS, NESTABLE_TABLE_TAGS)

    # Used to detect the charset in a META tag; see start_meta
    CHARSET_RE = re.compile("((^|;)\s*charset=)([^;]*)", re.M)

    def start_meta(self, attrs):
        """Beautiful Soup can detect a charset included in a META tag,
        try to convert the document to that charset, and re-parse the
        document from the beginning."""
        httpEquiv = None
        contentType = None
        contentTypeIndex = None
        tagNeedsEncodingSubstitution = False

        # Pull out the http-equiv and content attributes, remembering
        # where 'content' was so it can be rewritten in place.
        for i in range(0, len(attrs)):
            key, value = attrs[i]
            key = key.lower()
            if key == 'http-equiv':
                httpEquiv = value
            elif key == 'content':
                contentType = value
                contentTypeIndex = i

        if httpEquiv and contentType: # It's an interesting meta tag.
            match = self.CHARSET_RE.search(contentType)
            if match:
                if (self.declaredHTMLEncoding is not None or
                    self.originalEncoding == self.fromEncoding):
                    # An HTML encoding was sniffed while converting
                    # the document to Unicode, or an HTML encoding was
                    # sniffed during a previous pass through the
                    # document, or an encoding was specified
                    # explicitly and it worked. Rewrite the meta tag.
                    def rewrite(match):
                        return match.group(1) + "%SOUP-ENCODING%"
                    newAttr = self.CHARSET_RE.sub(rewrite, contentType)
                    attrs[contentTypeIndex] = (attrs[contentTypeIndex][0],
                                               newAttr)
                    tagNeedsEncodingSubstitution = True
                else:
                    # This is our first pass through the document.
                    # Go through it again with the encoding information.
                    newCharset = match.group(3)
                    if newCharset and newCharset != self.originalEncoding:
                        self.declaredHTMLEncoding = newCharset
                        self._feed(self.declaredHTMLEncoding)
                        raise StopParsing
                    pass
        tag = self.unknown_starttag("meta", attrs)
        if tag and tagNeedsEncodingSubstitution:
            tag.containsSubstitutions = True
| 1622 | |
class StopParsing(Exception):
    """Control-flow exception used to abandon the current parse, e.g.
    when start_meta discovers a new document encoding mid-parse and the
    document has already been re-parsed from the beginning."""
| 1625 | |
class ICantBelieveItsBeautifulSoup(BeautifulSoup):

    """The BeautifulSoup class is oriented towards skipping over
    common HTML errors like unclosed tags. However, sometimes it makes
    errors of its own. For instance, consider this fragment:

     <b>Foo<b>Bar</b></b>

    This is perfectly valid (if bizarre) HTML. However, the
    BeautifulSoup class will implicitly close the first b tag when it
    encounters the second 'b'. It will think the author wrote
    "<b>Foo<b>Bar", and didn't close the first 'b' tag, because
    there's no real-world reason to bold something that's already
    bold. When it encounters '</b></b>' it will close two more 'b'
    tags, for a grand total of three tags closed instead of two. This
    can throw off the rest of your document structure. The same is
    true of a number of other tags, listed below.

    It's much more common for someone to forget to close a 'b' tag
    than to actually use nested 'b' tags, and the BeautifulSoup class
    handles the common case. This class handles the not-so-common
    case: where you can't believe someone wrote what they did, but
    it's valid HTML and BeautifulSoup screwed up by assuming it
    wouldn't be."""

    # Inline tags that this class allows to nest inside themselves.
    # ('strong' and 'big' were each listed twice in the original;
    # the duplicates were redundant and have been removed.)
    I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS = \
     ('em', 'big', 'i', 'small', 'tt', 'abbr', 'acronym', 'strong',
      'cite', 'code', 'dfn', 'kbd', 'samp', 'var', 'b')

    I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS = ('noscript',)

    # Extend the parent's nesting rules with the tags above; the
    # [] default means "nestable anywhere" for each added tag.
    NESTABLE_TAGS = buildTagMap([], BeautifulSoup.NESTABLE_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_BLOCK_TAGS,
                                I_CANT_BELIEVE_THEYRE_NESTABLE_INLINE_TAGS)
| 1661 | |
class MinimalSoup(BeautifulSoup):
    """The MinimalSoup class is for parsing HTML that contains
    pathologically bad markup. It makes no assumptions about tag
    nesting, but it does know which tags are self-closing, that
    <script> tags contain Javascript and should not be parsed, that
    META tags may contain encoding information, and so on.

    This also makes it better for subclassing than BeautifulStoneSoup
    or BeautifulSoup."""

    # Only <noscript> forces open tags to be popped; every other
    # nesting heuristic from BeautifulSoup is switched off.
    RESET_NESTING_TAGS = buildTagMap('noscript')
    NESTABLE_TAGS = {}
| 1674 | |
class BeautifulSOAP(BeautifulStoneSoup):
    """A BeautifulStoneSoup variant that promotes a tag's lone string
    child into an attribute of the tag's parent. The attribute's name
    is the tag name, and the value is the string child. An example
    should give the flavor of the change:

    <foo><bar>baz</bar></foo>
     =>
    <foo bar="baz"><bar>baz</bar></foo>

    You can then access fooTag['bar'] instead of fooTag.barTag.string.

    This is, of course, useful for scraping structures that tend to
    use subelements instead of attributes, such as SOAP messages. Note
    that it modifies its input, so don't print the modified version
    out.

    I'm not sure how many people really want to use this class; let me
    know if you do. Mainly I like the name."""

    def popTag(self):
        """Before popping, copy a single-string child up into the
        parent's attribute map (unless the parent already has an
        attribute by that name)."""
        if len(self.tagStack) > 1:
            parent, tag = self.tagStack[-2], self.tagStack[-1]
            parent._getAttrMap()
            promotable = (isinstance(tag, Tag)
                          and len(tag.contents) == 1
                          and isinstance(tag.contents[0], NavigableString)
                          and not parent.attrMap.has_key(tag.name))
            if promotable:
                parent[tag.name] = tag.contents[0]
        BeautifulStoneSoup.popTag(self)
| 1705 | |
| 1706 | #Enterprise class names! It has come to our attention that some people |
| 1707 | #think the names of the Beautiful Soup parser classes are too silly |
| 1708 | #and "unprofessional" for use in enterprise screen-scraping. We feel |
| 1709 | #your pain! For such-minded folk, the Beautiful Soup Consortium And |
| 1710 | #All-Night Kosher Bakery recommends renaming this file to |
| 1711 | #"RobustParser.py" (or, in cases of extreme enterprisiness, |
| 1712 | #"RobustParserBeanInterface.class") and using the following |
| 1713 | #enterprise-friendly class aliases: |
class RobustXMLParser(BeautifulStoneSoup):
    """Enterprise-friendly alias for BeautifulStoneSoup."""
    pass
class RobustHTMLParser(BeautifulSoup):
    """Enterprise-friendly alias for BeautifulSoup."""
    pass
class RobustWackAssHTMLParser(ICantBelieveItsBeautifulSoup):
    """Enterprise-friendly(ish) alias for ICantBelieveItsBeautifulSoup."""
    pass
class RobustInsanelyWackAssHTMLParser(MinimalSoup):
    """Enterprise-friendly(ish) alias for MinimalSoup."""
    pass
class SimplifyingSOAPParser(BeautifulSOAP):
    """Enterprise-friendly alias for BeautifulSOAP."""
    pass
| 1724 | |
| 1725 | ###################################################### |
| 1726 | # |
| 1727 | # Bonus library: Unicode, Dammit |
| 1728 | # |
| 1729 | # This class forces XML data into a standard format (usually to UTF-8 |
| 1730 | # or Unicode). It is heavily based on code from Mark Pilgrim's |
| 1731 | # Universal Feed Parser. It does not rewrite the XML or HTML to |
| 1732 | # reflect a new encoding: that happens in BeautifulStoneSoup.handle_pi |
| 1733 | # (XML) and BeautifulSoup.start_meta (HTML). |
| 1734 | |
| 1735 | # Autodetects character encodings. |
| 1736 | # Download from http://chardet.feedparser.org/ |
| 1737 | try: |
| 1738 | import chardet |
| 1739 | # import chardet.constants |
| 1740 | # chardet.constants._debug = 1 |
| 1741 | except ImportError: |
| 1742 | chardet = None |
| 1743 | |
| 1744 | # cjkcodecs and iconv_codec make Python know about more character encodings. |
| 1745 | # Both are available from http://cjkpython.i18n.org/ |
| 1746 | # They're built in if you use Python 2.4. |
| 1747 | try: |
| 1748 | import cjkcodecs.aliases |
| 1749 | except ImportError: |
| 1750 | pass |
| 1751 | try: |
| 1752 | import iconv_codec |
| 1753 | except ImportError: |
| 1754 | pass |
| 1755 | |
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction, self.unicode holds the converted document (or
    None if every candidate encoding failed) and self.originalEncoding
    holds the codec name that worked."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def __init__(self, markup, overrideEncodings=[],
                 smartQuotesTo='xml', isHTML=False):
        """Convert markup to Unicode, trying candidate encodings in
        order of reliability: caller-supplied overrideEncodings first,
        then the encoding declared inside the document, then the
        encoding sniffed from its byte-order mark, then chardet's guess
        (if chardet is installed), and finally utf-8 and windows-1252.

        NOTE(review): overrideEncodings uses a mutable default list,
        but it is only iterated, never mutated, so this is harmless.
        smartQuotesTo selects 'xml' numeric or HTML named entities for
        Windows smart-quote bytes; see _subMSChar."""
        self.declaredHTMLEncoding = None
        self.markup, documentEncoding, sniffedEncoding = \
                     self._detectEncoding(markup, isHTML)
        self.smartQuotesTo = smartQuotesTo
        self.triedEncodings = []
        if markup == '' or isinstance(markup, unicode):
            # Nothing to decode: input is empty or already Unicode.
            self.originalEncoding = None
            self.unicode = unicode(markup)
            return

        u = None
        for proposedEncoding in overrideEncodings:
            u = self._convertFrom(proposedEncoding)
            if u: break
        if not u:
            for proposedEncoding in (documentEncoding, sniffedEncoding):
                u = self._convertFrom(proposedEncoding)
                if u: break

        # If no luck and we have auto-detection library, try that:
        if not u and chardet and not isinstance(self.markup, unicode):
            u = self._convertFrom(chardet.detect(self.markup)['encoding'])

        # As a last resort, try utf-8 and windows-1252:
        if not u:
            for proposed_encoding in ("utf-8", "windows-1252"):
                u = self._convertFrom(proposed_encoding)
                if u: break

        self.unicode = u
        if not u: self.originalEncoding = None

    def _subMSChar(self, orig):
        """Changes a MS smart quote character to an XML or HTML
        entity."""
        sub = self.MS_CHARS.get(orig)
        # Tuple entries are (HTML entity name, hex code point); plain
        # string entries (like '?') are used verbatim.
        if isinstance(sub, tuple):
            if self.smartQuotesTo == 'xml':
                sub = '&#x%s;' % sub[1]
            else:
                sub = '&%s;' % sub[0]
        return sub

    def _convertFrom(self, proposed):
        """Try to decode self.markup using the 'proposed' encoding.
        Returns the Unicode document on success; returns None on
        failure or if this codec was already tried."""
        proposed = self.find_codec(proposed)
        if not proposed or proposed in self.triedEncodings:
            return None
        self.triedEncodings.append(proposed)
        markup = self.markup

        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if self.smartQuotesTo and proposed.lower() in("windows-1252",
                                                      "iso-8859-1",
                                                      "iso-8859-2"):
            markup = re.compile("([\x80-\x9f])").sub \
                     (lambda(x): self._subMSChar(x.group(1)),
                      markup)

        try:
            # print "Trying to convert document to %s" % proposed
            u = self._toUnicode(markup, proposed)
            self.markup = u
            self.originalEncoding = proposed
        except Exception, e:
            # Decoding failed; the caller will try another encoding.
            # print "That didn't work!"
            # print e
            return None
        #print "Correct encoding: %s" % proposed
        return self.markup

    def _toUnicode(self, data, encoding):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''

        # strip Byte Order Mark (if present); a recognized BOM also
        # overrides the caller-supplied encoding.
        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
               and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
                 and (data[2:4] != '\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == '\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == '\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == '\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        newdata = unicode(data, encoding)
        return newdata

    def _detectEncoding(self, xml_data, isHTML=False):
        """Given a document, tries to detect its XML encoding.

        Returns (xml_data, xml_encoding, sniffed_xml_encoding): the
        document (possibly re-encoded to UTF-8 when a UTF-16/32 BOM or
        pattern was found), the encoding declared in an XML prolog or
        HTML meta tag (if any), and the encoding sniffed from the
        leading bytes (if any)."""
        xml_encoding = sniffed_xml_encoding = None
        try:
            if xml_data[:4] == '\x4c\x6f\xa7\x94':
                # EBCDIC
                xml_data = self._ebcdic_to_ascii(xml_data)
            elif xml_data[:4] == '\x00\x3c\x00\x3f':
                # UTF-16BE
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
                     and (xml_data[2:4] != '\x00\x00'):
                # UTF-16BE with BOM
                sniffed_xml_encoding = 'utf-16be'
                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x3f\x00':
                # UTF-16LE
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
            elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
                     (xml_data[2:4] != '\x00\x00'):
                # UTF-16LE with BOM
                sniffed_xml_encoding = 'utf-16le'
                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\x00\x3c':
                # UTF-32BE
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\x3c\x00\x00\x00':
                # UTF-32LE
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
            elif xml_data[:4] == '\x00\x00\xfe\xff':
                # UTF-32BE with BOM
                sniffed_xml_encoding = 'utf-32be'
                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
            elif xml_data[:4] == '\xff\xfe\x00\x00':
                # UTF-32LE with BOM
                sniffed_xml_encoding = 'utf-32le'
                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
            elif xml_data[:3] == '\xef\xbb\xbf':
                # UTF-8 with BOM
                sniffed_xml_encoding = 'utf-8'
                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
            else:
                sniffed_xml_encoding = 'ascii'
                pass
        except:
            # NOTE(review): bare except — any re-encoding failure falls
            # through with the data unchanged and no declared encoding.
            xml_encoding_match = None
        # Look for an encoding declared in the XML prolog...
        xml_encoding_match = re.compile(
            '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
        if not xml_encoding_match and isHTML:
            # ...or, for HTML, in a meta-tag charset declaration.
            regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
            xml_encoding_match = regexp.search(xml_data)
        if xml_encoding_match is not None:
            xml_encoding = xml_encoding_match.groups()[0].lower()
            if isHTML:
                self.declaredHTMLEncoding = xml_encoding
            # A declared multi-byte family name (e.g. plain 'utf-16')
            # is ambiguous; prefer the byte-order sniffed variant.
            if sniffed_xml_encoding and \
               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
                                 'utf16', 'u16')):
                xml_encoding = sniffed_xml_encoding
        return xml_data, xml_encoding, sniffed_xml_encoding


    def find_codec(self, charset):
        """Map a declared charset name to a Python codec name, trying
        the CHARSET_ALIASES table and simple hyphen rewrites; falls
        back to the charset name itself if nothing resolves."""
        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
               or (charset and self._codec(charset.replace("-", ""))) \
               or (charset and self._codec(charset.replace("-", "_"))) \
               or charset

    def _codec(self, charset):
        """Return 'charset' if Python has a codec for it, else None."""
        if not charset: return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec

    # Lazily-built 256-byte translation table; shared by all instances.
    EBCDIC_TO_ASCII_MAP = None
    def _ebcdic_to_ascii(self, s):
        """Translate an EBCDIC byte string to its ASCII equivalent."""
        c = self.__class__
        if not c.EBCDIC_TO_ASCII_MAP:
            # emap[i] is the ASCII byte value for EBCDIC byte i.
            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
                    250,251,252,253,254,255)
            import string
            c.EBCDIC_TO_ASCII_MAP = string.maketrans( \
                ''.join(map(chr, range(256))), ''.join(map(chr, emap)))
        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # Windows-1252 "smart" punctuation bytes mapped to
    # (HTML entity name, hex code point) pairs, or a literal
    # replacement string where no entity applies; used by _subMSChar.
    MS_CHARS = { '\x80' : ('euro', '20AC'),
                 '\x81' : ' ',
                 '\x82' : ('sbquo', '201A'),
                 '\x83' : ('fnof', '192'),
                 '\x84' : ('bdquo', '201E'),
                 '\x85' : ('hellip', '2026'),
                 '\x86' : ('dagger', '2020'),
                 '\x87' : ('Dagger', '2021'),
                 '\x88' : ('circ', '2C6'),
                 '\x89' : ('permil', '2030'),
                 '\x8A' : ('Scaron', '160'),
                 '\x8B' : ('lsaquo', '2039'),
                 '\x8C' : ('OElig', '152'),
                 '\x8D' : '?',
                 '\x8E' : ('#x17D', '17D'),
                 '\x8F' : '?',
                 '\x90' : '?',
                 '\x91' : ('lsquo', '2018'),
                 '\x92' : ('rsquo', '2019'),
                 '\x93' : ('ldquo', '201C'),
                 '\x94' : ('rdquo', '201D'),
                 '\x95' : ('bull', '2022'),
                 '\x96' : ('ndash', '2013'),
                 '\x97' : ('mdash', '2014'),
                 '\x98' : ('tilde', '2DC'),
                 '\x99' : ('trade', '2122'),
                 '\x9a' : ('scaron', '161'),
                 '\x9b' : ('rsaquo', '203A'),
                 '\x9c' : ('oelig', '153'),
                 '\x9d' : '?',
                 '\x9e' : ('#x17E', '17E'),
                 '\x9f' : ('Yuml', ''),}
| 2009 | |
| 2010 | ####################################################################### |
| 2011 | |
| 2012 | |
| 2013 | #By default, act as an HTML pretty-printer. |
| 2014 | if __name__ == '__main__': |
| 2015 | import sys |
| 2016 | soup = BeautifulSoup(sys.stdin) |
| 2017 | print soup.prettify() |