Blame - Lib/HTMLParser.py - platform/external/python/cpython3

blob: 363a6723a8b35406ef8d3b07a57d7f2cecf96bf5 [file] [log] [blame]

Guido van Rossum	8846d71	2001-05-18 14:50:52 +0000	[diff] [blame^]	1	"""A parser for HTML."""
				2
				3	# This file is based on sgmllib.py, but the API is slightly different.
				4
				5	# XXX There should be a way to distinguish between PCDATA (parsed
				6	# character data -- the normal case), RCDATA (replaceable character
				7	# data -- only char and entity references and end tags are special)
				8	# and CDATA (character data -- only end tags are special).
				9
				10
				11	import re
				12	import string
				13
				14	# Regular expressions used for parsing
				15
				16	interesting_normal = re.compile('[&<]')
				17	interesting_cdata = re.compile(r'<(/\|\Z)')
				18	incomplete = re.compile('&([a-zA-Z][-.a-zA-Z0-9]\|#[0-9])?')
				19
				20	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
				21	charref = re.compile('&#([0-9]+)[^0-9]')
				22
				23	starttagopen = re.compile('<[a-zA-Z]')
				24	piopen = re.compile(r'<\?')
				25	piclose = re.compile('>')
				26	endtagopen = re.compile('</')
				27	declopen = re.compile('<!')
				28	special = re.compile('<![^<>]*>')
				29	commentopen = re.compile('<!--')
				30	commentclose = re.compile(r'--\s*>')
				31	tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
				32	attrfind = re.compile(
				33	r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'
				34	r'(\'[^\']\'\|"[^"]"\|[-a-zA-Z0-9./:;+%?!&$_#=~]))?')
				35
				36	locatestarttagend = re.compile(r"""
				37	<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
				38	(?:\s+ # whitespace before attribute name
				39	(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
				40	(?:\s=\s # value indicator
				41	(?:'[^']*' # LITA-enclosed value
				42	\|\"[^\"]*\" # LIT-enclosed value
				43	\|[^'\">\s]+ # bare value
				44	)
				45	)?
				46	)
				47	)*
				48	\s* # trailing whitespace
				49	""", re.VERBOSE)
				50	endstarttag = re.compile(r"\s*/?>")
				51	endendtag = re.compile('>')
				52	endtagfind = re.compile('</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')
				53
				54	declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s')
				55	declstringlit = re.compile(r'(\'[^\']\'\|"[^"]")\s*')
				56
				57
				58	class HTMLParseError(Exception):
				59	"""Exception raised for all parse errors."""
				60
				61	def __init__(self, msg, position=(None, None)):
				62	assert msg
				63	self.msg = msg
				64	self.lineno = position[0]
				65	self.offset = position[1]
				66
				67	def __str__(self):
				68	result = self.msg
				69	if self.lineno is not None:
				70	result = result + ", at line %d" % self.lineno
				71	if self.offset is not None:
				72	result = result + ", column %d" % (self.offset + 1)
				73	return result
				74
				75
# HTML parser class -- find tags and call handler functions.
# Usage: p = HTMLParser(); p.feed(data); ...; p.close().
# Markup is reported by calling overridable handler methods: a subclass
# overrides handle_starttag() and handle_endtag() to handle <foo> and
# </foo>, respectively, or handle_startendtag() to handle <foo/> by
# itself.  (Tag names are converted to lower case before being passed
# to these methods.)  The data between tags is passed to the parser by
# calling self.handle_data() with some data as argument (the data may
# be split up in arbitrary chunks).  Entity references are passed by
# calling self.handle_entityref() with the entity reference as argument.
				86
				87	class HTMLParser:
				88
				89	CDATA_CONTENT_ELEMENTS = ("script", "style")
				90
				91
				92	# Interface -- initialize and reset this instance
				93	def __init__(self):
				94	self.reset()
				95
				96	# Interface -- reset this instance. Loses all unprocessed data
				97	def reset(self):
				98	self.rawdata = ''
				99	self.stack = []
				100	self.lasttag = '???'
				101	self.lineno = 1
				102	self.offset = 0
				103	self.interesting = interesting_normal
				104
				105	# Interface -- feed some data to the parser. Call this as
				106	# often as you want, with as little or as much text as you
				107	# want (may include '\n'). (This just saves the text, all the
				108	# processing is done by goahead().)
				109	def feed(self, data):
				110	self.rawdata = self.rawdata + data
				111	self.goahead(0)
				112
				113	# Interface -- handle the remaining data
				114	def close(self):
				115	self.goahead(1)
				116
				117	# Internal -- update line number and offset. This should be
				118	# called for each piece of data exactly once, in order -- in other
				119	# words the concatenation of all the input strings to this
				120	# function should be exactly the entire input.
				121	def updatepos(self, i, j):
				122	if i >= j:
				123	return j
				124	rawdata = self.rawdata
				125	nlines = string.count(rawdata, "\n", i, j)
				126	if nlines:
				127	self.lineno = self.lineno + nlines
				128	pos = string.rindex(rawdata, "\n", i, j) # Should not fail
				129	self.offset = j-(pos+1)
				130	else:
				131	self.offset = self.offset + j-i
				132	return j
				133
				134	# Interface -- return current line number and offset.
				135	def getpos(self):
				136	return self.lineno, self.offset
				137
				138	__starttag_text = None
				139
				140	# Interface -- return full source of start tag: "<...>"
				141	def get_starttag_text(self):
				142	return self.__starttag_text
				143
				144	def set_cdata_mode(self):
				145	self.interesting = interesting_cdata
				146
				147	def clear_cdata_mode(self):
				148	self.interesting = interesting_normal
				149
				150	# Internal -- handle data as far as reasonable. May leave state
				151	# and data to be processed by a subsequent call. If 'end' is
				152	# true, force handling all data as if followed by EOF marker.
				153	def goahead(self, end):
				154	rawdata = self.rawdata
				155	i = 0
				156	n = len(rawdata)
				157	while i < n:
				158	match = self.interesting.search(rawdata, i) # < or &
				159	if match:
				160	j = match.start()
				161	else:
				162	j = n
				163	if i < j: self.handle_data(rawdata[i:j])
				164	i = self.updatepos(i, j)
				165	if i == n: break
				166	if rawdata[i] == '<':
				167	if starttagopen.match(rawdata, i): # < + letter
				168	k = self.parse_starttag(i)
				169	elif endtagopen.match(rawdata, i): # </
				170	k = self.parse_endtag(i)
				171	if k >= 0:
				172	self.clear_cdata_mode()
				173	elif commentopen.match(rawdata, i): # <!--
				174	k = self.parse_comment(i)
				175	elif piopen.match(rawdata, i): # <?
				176	k = self.parse_pi(i)
				177	elif declopen.match(rawdata, i): # <!
				178	k = self.parse_declaration(i)
				179	else:
				180	if i < n-1:
				181	raise HTMLParseError(
				182	"invalid '<' construct: %s" % `rawdata[i:i+2]`,
				183	self.getpos())
				184	k = -1
				185	if k < 0:
				186	if end:
				187	raise HTMLParseError("EOF in middle of construct",
				188	self.getpos())
				189	break
				190	i = self.updatepos(i, k)
				191	elif rawdata[i] == '&':
				192	match = charref.match(rawdata, i)
				193	if match:
				194	name = match.group(1)
				195	self.handle_charref(name)
				196	k = match.end()
				197	if rawdata[k-1] != ';':
				198	k = k-1
				199	i = self.updatepos(i, k)
				200	continue
				201	match = entityref.match(rawdata, i)
				202	if match:
				203	name = match.group(1)
				204	self.handle_entityref(name)
				205	k = match.end()
				206	if rawdata[k-1] != ';':
				207	k = k-1
				208	i = self.updatepos(i, k)
				209	continue
				210	if incomplete.match(rawdata, i):
				211	if end:
				212	raise HTMLParseError(
				213	"EOF in middle of entity or char ref",
				214	self.getpos())
				215	return -1 # incomplete
				216	raise HTMLParseError("'&' not part of entity or char ref",
				217	self.getpos())
				218	else:
				219	assert 0, "interesting.search() lied"
				220	# end while
				221	if end and i < n:
				222	self.handle_data(rawdata[i:n])
				223	i = self.updatepos(i, n)
				224	self.rawdata = rawdata[i:]
				225
				226	# Internal -- parse comment, return end or -1 if not terminated
				227	def parse_comment(self, i):
				228	rawdata = self.rawdata
				229	assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
				230	match = commentclose.search(rawdata, i+4)
				231	if not match:
				232	return -1
				233	j = match.start()
				234	self.handle_comment(rawdata[i+4: j])
				235	j = match.end()
				236	return j
				237
				238	# Internal -- parse declaration.
				239	def parse_declaration(self, i):
				240	# This is some sort of declaration; in "HTML as
				241	# deployed," this should only be the document type
				242	# declaration ("<!DOCTYPE html...>").
				243	rawdata = self.rawdata
				244	j = i + 2
				245	assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
				246	if rawdata[j:j+1] in ("-", ""):
				247	# Start of comment followed by buffer boundary,
				248	# or just a buffer boundary.
				249	return -1
				250	# in practice, this should look like: ((name\|stringlit) S*)+ '>'
				251	n = len(rawdata)
				252	while j < n:
				253	c = rawdata[j]
				254	if c == ">":
				255	# end of declaration syntax
				256	self.handle_decl(rawdata[i+2:j])
				257	return j + 1
				258	if c in "\"'":
				259	m = declstringlit.match(rawdata, j)
				260	if not m:
				261	return -1 # incomplete
				262	j = m.end()
				263	elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
				264	m = declname.match(rawdata, j)
				265	if not m:
				266	return -1 # incomplete
				267	j = m.end()
				268	else:
				269	raise HTMLParseError(
				270	"unexpected char in declaration: %s" % `rawdata[j]`,
				271	self.getpos())
				272	return -1 # incomplete
				273
				274	# Internal -- parse processing instr, return end or -1 if not terminated
				275	def parse_pi(self, i):
				276	rawdata = self.rawdata
				277	assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
				278	match = piclose.search(rawdata, i+2) # >
				279	if not match:
				280	return -1
				281	j = match.start()
				282	self.handle_pi(rawdata[i+2: j])
				283	j = match.end()
				284	return j
				285
				286	# Internal -- handle starttag, return end or -1 if not terminated
				287	def parse_starttag(self, i):
				288	self.__starttag_text = None
				289	endpos = self.check_for_whole_start_tag(i)
				290	if endpos < 0:
				291	return endpos
				292	rawdata = self.rawdata
				293	self.__starttag_text = rawdata[i:endpos]
				294
				295	# Now parse the data between i+1 and j into a tag and attrs
				296	attrs = []
				297	match = tagfind.match(rawdata, i+1)
				298	assert match, 'unexpected call to parse_starttag()'
				299	k = match.end()
				300	self.lasttag = tag = string.lower(rawdata[i+1:k])
				301
				302	while k < endpos:
				303	m = attrfind.match(rawdata, k)
				304	if not m:
				305	break
				306	attrname, rest, attrvalue = m.group(1, 2, 3)
				307	if not rest:
				308	attrvalue = None
				309	elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
				310	attrvalue[:1] == '"' == attrvalue[-1:]:
				311	attrvalue = attrvalue[1:-1]
				312	attrvalue = self.unescape(attrvalue)
				313	attrs.append((string.lower(attrname), attrvalue))
				314	k = m.end()
				315
				316	end = string.strip(rawdata[k:endpos])
				317	if end not in (">", "/>"):
				318	lineno, offset = self.getpos()
				319	if "\n" in self.__starttag_text:
				320	lineno = lineno + string.count(self.__starttag_text, "\n")
				321	offset = len(self.__starttag_text) \
				322	- string.rfind(self.__starttag_text, "\n")
				323	else:
				324	offset = offset + len(self.__starttag_text)
				325	raise HTMLParseError("junk characters in start tag: %s"
				326	% `rawdata[k:endpos][:20]`,
				327	(lineno, offset))
				328	if end[-2:] == '/>':
				329	# XHTML-style empty tag: <span attr="value" />
				330	self.handle_startendtag(tag, attrs)
				331	else:
				332	self.handle_starttag(tag, attrs)
				333	if tag in self.CDATA_CONTENT_ELEMENTS:
				334	self.set_cdata_mode()
				335	return endpos
				336
				337	# Internal -- check to see if we have a complete starttag; return end
				338	# or -1 if incomplete.
				339	def check_for_whole_start_tag(self, i):
				340	rawdata = self.rawdata
				341	m = locatestarttagend.match(rawdata, i)
				342	if m:
				343	j = m.end()
				344	next = rawdata[j:j+1]
				345	if next == ">":
				346	return j + 1
				347	if next == "/":
				348	s = rawdata[j:j+2]
				349	if s == "/>":
				350	return j + 2
				351	if s == "/":
				352	# buffer boundary
				353	return -1
				354	# else bogus input
				355	self.updatepos(i, j + 1)
				356	raise HTMLParseError("malformed empty start tag",
				357	self.getpos())
				358	if next == "":
				359	# end of input
				360	return -1
				361	if next in ("abcdefghijklmnopqrstuvwxyz=/"
				362	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
				363	# end of input in or before attribute value, or we have the
				364	# '/' from a '/>' ending
				365	return -1
				366	self.updatepos(i, j)
				367	raise HTMLParseError("malformed start tag", self.getpos())
				368	raise AssertionError("we should not gt here!")
				369
				370	# Internal -- parse endtag, return end or -1 if incomplete
				371	def parse_endtag(self, i):
				372	rawdata = self.rawdata
				373	assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
				374	match = endendtag.search(rawdata, i+1) # >
				375	if not match:
				376	return -1
				377	j = match.end()
				378	match = endtagfind.match(rawdata, i) # </ + tag + >
				379	if not match:
				380	raise HTMLParseError("bad end tag: %s" % `rawdata[i:j]`,
				381	self.getpos())
				382	tag = match.group(1)
				383	self.handle_endtag(string.lower(tag))
				384	return j
				385
				386	# Overridable -- finish processing of start+end tag: <tag.../>
				387	def handle_startendtag(self, tag, attrs):
				388	self.handle_starttag(tag, attrs)
				389	self.handle_endtag(tag)
				390
				391	# Overridable -- handle start tag
				392	def handle_starttag(self, tag, attrs):
				393	pass
				394
				395	# Overridable -- handle end tag
				396	def handle_endtag(self, tag):
				397	pass
				398
				399	# Overridable -- handle character reference
				400	def handle_charref(self, name):
				401	pass
				402
				403	# Overridable -- handle entity reference
				404	def handle_entityref(self, name):
				405	pass
				406
				407	# Overridable -- handle data
				408	def handle_data(self, data):
				409	pass
				410
				411	# Overridable -- handle comment
				412	def handle_comment(self, data):
				413	pass
				414
				415	# Overridable -- handle declaration
				416	def handle_decl(self, decl):
				417	pass
				418
				419	# Overridable -- handle processing instruction
				420	def handle_pi(self, data):
				421	pass
				422
				423	# Internal -- helper to remove special character quoting
				424	def unescape(self, s):
				425	if '&' not in s:
				426	return s
				427	s = string.replace(s, "<", "<")
				428	s = string.replace(s, ">", ">")
				429	s = string.replace(s, "'", "'")
				430	s = string.replace(s, """, '"')
				431	s = string.replace(s, "&", "&") # Must be last
				432	return s