Blame - Lib/HTMLParser.py - platform/external/python/cpython2

blob: 339c132207afd23e9bbc3f95d8cbfa9cbd1a5450 [file] [log] [blame]

Guido van Rossum	8846d71	2001-05-18 14:50:52 +0000	[diff] [blame]	1	"""A parser for HTML."""
				2
				3	# This file is based on sgmllib.py, but the API is slightly different.
				4
				5	# XXX There should be a way to distinguish between PCDATA (parsed
				6	# character data -- the normal case), RCDATA (replaceable character
				7	# data -- only char and entity references and end tags are special)
				8	# and CDATA (character data -- only end tags are special).
				9
				10
				11	import re
				12	import string
				13
				14	# Regular expressions used for parsing
				15
				16	interesting_normal = re.compile('[&<]')
				17	interesting_cdata = re.compile(r'<(/\|\Z)')
				18	incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]\|#[0-9])?')
				19
				20	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
				21	charref = re.compile('&#([0-9]+)[^0-9]')
				22
				23	starttagopen = re.compile('<[a-zA-Z]')
				24	piopen = re.compile(r'<\?')
				25	piclose = re.compile('>')
				26	endtagopen = re.compile('</')
				27	declopen = re.compile('<!')
				28	special = re.compile('<![^<>]*>')
				29	commentopen = re.compile('<!--')
				30	commentclose = re.compile(r'--\s*>')
				31	tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
				32	attrfind = re.compile(
				33	r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'
				34	r'(\'[^\']\'\|"[^"]"\|[-a-zA-Z0-9./:;+%?!&$_#=~]))?')
				35
				36	locatestarttagend = re.compile(r"""
				37	<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
				38	(?:\s+ # whitespace before attribute name
				39	(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
				40	(?:\s=\s # value indicator
				41	(?:'[^']*' # LITA-enclosed value
				42	\|\"[^\"]*\" # LIT-enclosed value
				43	\|[^'\">\s]+ # bare value
				44	)
				45	)?
				46	)
				47	)*
				48	\s* # trailing whitespace
				49	""", re.VERBOSE)
				50	endstarttag = re.compile(r"\s*/?>")
				51	endendtag = re.compile('>')
				52	endtagfind = re.compile('</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')
				53
				54	declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s')
				55	declstringlit = re.compile(r'(\'[^\']\'\|"[^"]")\s*')
				56
				57
				58	class HTMLParseError(Exception):
				59	"""Exception raised for all parse errors."""
				60
				61	def __init__(self, msg, position=(None, None)):
				62	assert msg
				63	self.msg = msg
				64	self.lineno = position[0]
				65	self.offset = position[1]
				66
				67	def __str__(self):
				68	result = self.msg
				69	if self.lineno is not None:
				70	result = result + ", at line %d" % self.lineno
				71	if self.offset is not None:
				72	result = result + ", column %d" % (self.offset + 1)
				73	return result
				74
				75
				76	# HTML parser class -- find tags and call handler functions.
Fred Drake	1c48eb7	2001-05-23 04:53:44 +0000	[diff] [blame]	77	# Usage:
				78	#
				79	# p = HTMLParser(); p.feed(data); ...; p.close()
Guido van Rossum	07f353c	2001-05-22 23:39:10 +0000	[diff] [blame]	80
				81	# Start tags are handled by calling self.handle_starttag() or
				82	# self.handle_startendtag(); end tags by self.handle_endtag(). The
Fred Drake	1c48eb7	2001-05-23 04:53:44 +0000	[diff] [blame]	83	# data between tags is passed from the parser to the derived class by
				84	# calling self.handle_data() with the data as argument (the data may
				85	# be split up in arbitrary chunks). Entity references are passed by
				86	# calling self.handle_entityref() with the entity reference as the
				87	# argument. Numeric character references are passed to
				88	# self.handle_charref() with the string containing the reference as
				89	# the argument.
Guido van Rossum	8846d71	2001-05-18 14:50:52 +0000	[diff] [blame]	90
				91	class HTMLParser:
				92
				93	CDATA_CONTENT_ELEMENTS = ("script", "style")
				94
				95
				96	# Interface -- initialize and reset this instance
				97	def __init__(self):
				98	self.reset()
				99
				100	# Interface -- reset this instance. Loses all unprocessed data
				101	def reset(self):
				102	self.rawdata = ''
				103	self.stack = []
				104	self.lasttag = '???'
				105	self.lineno = 1
				106	self.offset = 0
				107	self.interesting = interesting_normal
				108
				109	# Interface -- feed some data to the parser. Call this as
				110	# often as you want, with as little or as much text as you
				111	# want (may include '\n'). (This just saves the text, all the
				112	# processing is done by goahead().)
				113	def feed(self, data):
				114	self.rawdata = self.rawdata + data
				115	self.goahead(0)
				116
				117	# Interface -- handle the remaining data
				118	def close(self):
				119	self.goahead(1)
				120
				121	# Internal -- update line number and offset. This should be
				122	# called for each piece of data exactly once, in order -- in other
				123	# words the concatenation of all the input strings to this
				124	# function should be exactly the entire input.
				125	def updatepos(self, i, j):
				126	if i >= j:
				127	return j
				128	rawdata = self.rawdata
				129	nlines = string.count(rawdata, "\n", i, j)
				130	if nlines:
				131	self.lineno = self.lineno + nlines
				132	pos = string.rindex(rawdata, "\n", i, j) # Should not fail
				133	self.offset = j-(pos+1)
				134	else:
				135	self.offset = self.offset + j-i
				136	return j
				137
				138	# Interface -- return current line number and offset.
				139	def getpos(self):
				140	return self.lineno, self.offset
				141
				142	__starttag_text = None
				143
				144	# Interface -- return full source of start tag: "<...>"
				145	def get_starttag_text(self):
				146	return self.__starttag_text
				147
				148	def set_cdata_mode(self):
				149	self.interesting = interesting_cdata
				150
				151	def clear_cdata_mode(self):
				152	self.interesting = interesting_normal
				153
				154	# Internal -- handle data as far as reasonable. May leave state
				155	# and data to be processed by a subsequent call. If 'end' is
				156	# true, force handling all data as if followed by EOF marker.
				157	def goahead(self, end):
				158	rawdata = self.rawdata
				159	i = 0
				160	n = len(rawdata)
				161	while i < n:
				162	match = self.interesting.search(rawdata, i) # < or &
				163	if match:
				164	j = match.start()
				165	else:
				166	j = n
				167	if i < j: self.handle_data(rawdata[i:j])
				168	i = self.updatepos(i, j)
				169	if i == n: break
				170	if rawdata[i] == '<':
				171	if starttagopen.match(rawdata, i): # < + letter
				172	k = self.parse_starttag(i)
				173	elif endtagopen.match(rawdata, i): # </
				174	k = self.parse_endtag(i)
				175	if k >= 0:
				176	self.clear_cdata_mode()
				177	elif commentopen.match(rawdata, i): # <!--
				178	k = self.parse_comment(i)
				179	elif piopen.match(rawdata, i): # <?
				180	k = self.parse_pi(i)
				181	elif declopen.match(rawdata, i): # <!
				182	k = self.parse_declaration(i)
				183	else:
				184	if i < n-1:
				185	raise HTMLParseError(
				186	"invalid '<' construct: %s" % `rawdata[i:i+2]`,
				187	self.getpos())
				188	k = -1
				189	if k < 0:
				190	if end:
				191	raise HTMLParseError("EOF in middle of construct",
				192	self.getpos())
				193	break
				194	i = self.updatepos(i, k)
				195	elif rawdata[i] == '&':
				196	match = charref.match(rawdata, i)
				197	if match:
				198	name = match.group(1)
				199	self.handle_charref(name)
				200	k = match.end()
				201	if rawdata[k-1] != ';':
				202	k = k-1
				203	i = self.updatepos(i, k)
				204	continue
				205	match = entityref.match(rawdata, i)
				206	if match:
				207	name = match.group(1)
				208	self.handle_entityref(name)
				209	k = match.end()
				210	if rawdata[k-1] != ';':
				211	k = k-1
				212	i = self.updatepos(i, k)
				213	continue
				214	if incomplete.match(rawdata, i):
				215	if end:
				216	raise HTMLParseError(
				217	"EOF in middle of entity or char ref",
				218	self.getpos())
				219	return -1 # incomplete
				220	raise HTMLParseError("'&' not part of entity or char ref",
				221	self.getpos())
				222	else:
				223	assert 0, "interesting.search() lied"
				224	# end while
				225	if end and i < n:
				226	self.handle_data(rawdata[i:n])
				227	i = self.updatepos(i, n)
				228	self.rawdata = rawdata[i:]
				229
				230	# Internal -- parse comment, return end or -1 if not terminated
				231	def parse_comment(self, i):
				232	rawdata = self.rawdata
				233	assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
				234	match = commentclose.search(rawdata, i+4)
				235	if not match:
				236	return -1
				237	j = match.start()
				238	self.handle_comment(rawdata[i+4: j])
				239	j = match.end()
				240	return j
				241
				242	# Internal -- parse declaration.
				243	def parse_declaration(self, i):
				244	# This is some sort of declaration; in "HTML as
				245	# deployed," this should only be the document type
				246	# declaration ("<!DOCTYPE html...>").
				247	rawdata = self.rawdata
				248	j = i + 2
				249	assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
				250	if rawdata[j:j+1] in ("-", ""):
				251	# Start of comment followed by buffer boundary,
				252	# or just a buffer boundary.
				253	return -1
				254	# in practice, this should look like: ((name\|stringlit) S*)+ '>'
				255	n = len(rawdata)
				256	while j < n:
				257	c = rawdata[j]
				258	if c == ">":
				259	# end of declaration syntax
				260	self.handle_decl(rawdata[i+2:j])
				261	return j + 1
				262	if c in "\"'":
				263	m = declstringlit.match(rawdata, j)
				264	if not m:
				265	return -1 # incomplete
				266	j = m.end()
				267	elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
				268	m = declname.match(rawdata, j)
				269	if not m:
				270	return -1 # incomplete
				271	j = m.end()
				272	else:
				273	raise HTMLParseError(
				274	"unexpected char in declaration: %s" % `rawdata[j]`,
				275	self.getpos())
				276	return -1 # incomplete
				277
				278	# Internal -- parse processing instr, return end or -1 if not terminated
				279	def parse_pi(self, i):
				280	rawdata = self.rawdata
				281	assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
				282	match = piclose.search(rawdata, i+2) # >
				283	if not match:
				284	return -1
				285	j = match.start()
				286	self.handle_pi(rawdata[i+2: j])
				287	j = match.end()
				288	return j
				289
				290	# Internal -- handle starttag, return end or -1 if not terminated
				291	def parse_starttag(self, i):
				292	self.__starttag_text = None
				293	endpos = self.check_for_whole_start_tag(i)
				294	if endpos < 0:
				295	return endpos
				296	rawdata = self.rawdata
				297	self.__starttag_text = rawdata[i:endpos]
				298
				299	# Now parse the data between i+1 and j into a tag and attrs
				300	attrs = []
				301	match = tagfind.match(rawdata, i+1)
				302	assert match, 'unexpected call to parse_starttag()'
				303	k = match.end()
				304	self.lasttag = tag = string.lower(rawdata[i+1:k])
				305
				306	while k < endpos:
				307	m = attrfind.match(rawdata, k)
				308	if not m:
				309	break
				310	attrname, rest, attrvalue = m.group(1, 2, 3)
				311	if not rest:
				312	attrvalue = None
				313	elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
				314	attrvalue[:1] == '"' == attrvalue[-1:]:
				315	attrvalue = attrvalue[1:-1]
				316	attrvalue = self.unescape(attrvalue)
				317	attrs.append((string.lower(attrname), attrvalue))
				318	k = m.end()
				319
				320	end = string.strip(rawdata[k:endpos])
				321	if end not in (">", "/>"):
				322	lineno, offset = self.getpos()
				323	if "\n" in self.__starttag_text:
				324	lineno = lineno + string.count(self.__starttag_text, "\n")
				325	offset = len(self.__starttag_text) \
				326	- string.rfind(self.__starttag_text, "\n")
				327	else:
				328	offset = offset + len(self.__starttag_text)
				329	raise HTMLParseError("junk characters in start tag: %s"
				330	% `rawdata[k:endpos][:20]`,
				331	(lineno, offset))
				332	if end[-2:] == '/>':
				333	# XHTML-style empty tag: <span attr="value" />
				334	self.handle_startendtag(tag, attrs)
				335	else:
				336	self.handle_starttag(tag, attrs)
				337	if tag in self.CDATA_CONTENT_ELEMENTS:
				338	self.set_cdata_mode()
				339	return endpos
				340
				341	# Internal -- check to see if we have a complete starttag; return end
				342	# or -1 if incomplete.
				343	def check_for_whole_start_tag(self, i):
				344	rawdata = self.rawdata
				345	m = locatestarttagend.match(rawdata, i)
				346	if m:
				347	j = m.end()
				348	next = rawdata[j:j+1]
				349	if next == ">":
				350	return j + 1
				351	if next == "/":
				352	s = rawdata[j:j+2]
				353	if s == "/>":
				354	return j + 2
				355	if s == "/":
				356	# buffer boundary
				357	return -1
				358	# else bogus input
				359	self.updatepos(i, j + 1)
				360	raise HTMLParseError("malformed empty start tag",
				361	self.getpos())
				362	if next == "":
				363	# end of input
				364	return -1
				365	if next in ("abcdefghijklmnopqrstuvwxyz=/"
				366	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
				367	# end of input in or before attribute value, or we have the
				368	# '/' from a '/>' ending
				369	return -1
				370	self.updatepos(i, j)
				371	raise HTMLParseError("malformed start tag", self.getpos())
				372	raise AssertionError("we should not gt here!")
				373
				374	# Internal -- parse endtag, return end or -1 if incomplete
				375	def parse_endtag(self, i):
				376	rawdata = self.rawdata
				377	assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
				378	match = endendtag.search(rawdata, i+1) # >
				379	if not match:
				380	return -1
				381	j = match.end()
				382	match = endtagfind.match(rawdata, i) # </ + tag + >
				383	if not match:
				384	raise HTMLParseError("bad end tag: %s" % `rawdata[i:j]`,
				385	self.getpos())
				386	tag = match.group(1)
				387	self.handle_endtag(string.lower(tag))
				388	return j
				389
				390	# Overridable -- finish processing of start+end tag: <tag.../>
				391	def handle_startendtag(self, tag, attrs):
				392	self.handle_starttag(tag, attrs)
				393	self.handle_endtag(tag)
				394
				395	# Overridable -- handle start tag
				396	def handle_starttag(self, tag, attrs):
				397	pass
				398
				399	# Overridable -- handle end tag
				400	def handle_endtag(self, tag):
				401	pass
				402
				403	# Overridable -- handle character reference
				404	def handle_charref(self, name):
				405	pass
				406
				407	# Overridable -- handle entity reference
				408	def handle_entityref(self, name):
				409	pass
				410
				411	# Overridable -- handle data
				412	def handle_data(self, data):
				413	pass
				414
				415	# Overridable -- handle comment
				416	def handle_comment(self, data):
				417	pass
				418
				419	# Overridable -- handle declaration
				420	def handle_decl(self, decl):
				421	pass
				422
				423	# Overridable -- handle processing instruction
				424	def handle_pi(self, data):
				425	pass
				426
				427	# Internal -- helper to remove special character quoting
				428	def unescape(self, s):
				429	if '&' not in s:
				430	return s
				431	s = string.replace(s, "<", "<")
				432	s = string.replace(s, ">", ">")
				433	s = string.replace(s, "'", "'")
				434	s = string.replace(s, """, '"')
				435	s = string.replace(s, "&", "&") # Must be last
				436	return s