Blame - Lib/xmllib.py - platform/external/python/cpython3

blob: 38328affce5061751665dceca992a0fc1510321b [file] [log] [blame]

Guido van Rossum	a219efa	1997-11-18 15:09:54 +0000	[diff] [blame]	1	# A parser for XML, using the derived class as static DTD.
				2	# Author: Sjoerd Mullender
				3
				4	import re
				5	import string
				6
				7
				8	# Regular expressions used for parsing
				9
				10	_S = '[ \t\r\n]+'
				11	_opS = '[ \t\r\n]*'
				12	_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
				13	interesting = re.compile('[&<]')
				14	incomplete = re.compile('&(' + _Name + '\|#[0-9]\|#x[0-9a-fA-F])?\|'
				15	'<([a-zA-Z_:][^<>]*\|'
				16	'/([a-zA-Z_:][^<>]*)?\|'
				17	'![^<>]*\|'
				18	'\?[^<>]*)?')
				19
				20	ref = re.compile('&(' + _Name + '\|#[0-9]+\|#x[0-9a-fA-F]+);?')
				21	entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
				22	charref = re.compile('&#(?P<char>[0-9]+[^0-9]\|x[0-9a-fA-F]+[^0-9a-fA-F])')
				23	space = re.compile(_S)
				24	newline = re.compile('\n')
				25
				26	starttagopen = re.compile('<' + _Name)
				27	endtagopen = re.compile('</')
				28	starttagend = re.compile(_opS + '(?P<slash>/?)>')
				29	endbracket = re.compile('>')
				30	tagfind = re.compile(_Name)
				31	cdataopen = re.compile('<!\[CDATA\[')
				32	cdataclose = re.compile('\]\]>')
				33	special = re.compile('<!(?P<special>[^<>]*)>')
				34	procopen = re.compile('<\?(?P<proc>' + _Name + ')' + _S)
				35	procclose = re.compile('\?>')
				36	commentopen = re.compile('<!--')
				37	commentclose = re.compile('-->')
				38	doubledash = re.compile('--')
				39	attrfind = re.compile(
				40	_S + '(?P<name>' + _Name + ')'
				41	'(' + _opS + '=' + _opS +
				42	'(?P<value>\'[^\']\'\|"[^"]"\|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
				43
				44
				45	# XML parser base class -- find tags and call handler functions.
				46	# Usage: p = XMLParser(); p.feed(data); ...; p.close().
				47	# The dtd is defined by deriving a class which defines methods
				48	# with special names to handle tags: start_foo and end_foo to handle
				49	# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
				50	# (Tags are converted to lower case for this purpose.) The data
				51	# between tags is passed to the parser by calling self.handle_data()
				52	# with some data as argument (the data may be split up in arbitrary
				53	# chunks). Entity references are passed by calling
				54	# self.handle_entityref() with the entity reference as argument.
				55
				56	class XMLParser:
				57
				58	# Interface -- initialize and reset this instance
				59	def __init__(self, verbose=0):
				60	self.verbose = verbose
				61	self.reset()
				62
				63	# Interface -- reset this instance. Loses all unprocessed data
				64	def reset(self):
				65	self.rawdata = ''
				66	self.stack = []
				67	self.lasttag = '???'
				68	self.nomoretags = 0
				69	self.literal = 0
				70	self.lineno = 1
				71
				72	# For derived classes only -- enter literal mode (CDATA) till EOF
				73	def setnomoretags(self):
				74	self.nomoretags = self.literal = 1
				75
				76	# For derived classes only -- enter literal mode (CDATA)
				77	def setliteral(self, *args):
				78	self.literal = 1
				79
				80	# Interface -- feed some data to the parser. Call this as
				81	# often as you want, with as little or as much text as you
				82	# want (may include '\n'). (This just saves the text, all the
				83	# processing is done by goahead().)
				84	def feed(self, data):
				85	self.rawdata = self.rawdata + data
				86	self.goahead(0)
				87
				88	# Interface -- handle the remaining data
				89	def close(self):
				90	self.goahead(1)
				91
				92	# Interface -- translate references
				93	def translate_references(self, data):
				94	newdata = []
				95	i = 0
				96	while 1:
				97	res = ref.search(data, i)
				98	if res is None:
				99	newdata.append(data[i:])
				100	return string.join(newdata, '')
				101	if data[res.end(0) - 1] != ';':
				102	self.syntax_error(self.lineno,
				103	'; missing after entity/char reference')
				104	newdata.append(data[i:res.start(0)])
				105	str = res.group(1)
				106	if str[0] == '#':
				107	if str[1] == 'x':
				108	newdata.append(chr(string.atoi(str[2:], 16)))
				109	else:
				110	newdata.append(chr(string.atoi(str[1:])))
				111	else:
				112	try:
				113	newdata.append(self.entitydefs[str])
				114	except KeyError:
				115	# can't do it, so keep the entity ref in
				116	newdata.append('&' + str + ';')
				117	i = res.end(0)
				118
				119	# Internal -- handle data as far as reasonable. May leave state
				120	# and data to be processed by a subsequent call. If 'end' is
				121	# true, force handling all data as if followed by EOF marker.
				122	def goahead(self, end):
				123	rawdata = self.rawdata
				124	i = 0
				125	n = len(rawdata)
				126	while i < n:
				127	if self.nomoretags:
				128	data = rawdata[i:n]
				129	self.handle_data(data)
				130	self.lineno = self.lineno + string.count(data, '\n')
				131	i = n
				132	break
				133	res = interesting.search(rawdata, i)
				134	if res:
				135	j = res.start(0)
				136	else:
				137	j = n
				138	if i < j:
				139	data = rawdata[i:j]
				140	self.handle_data(data)
				141	self.lineno = self.lineno + string.count(data, '\n')
				142	i = j
				143	if i == n: break
				144	if rawdata[i] == '<':
				145	if starttagopen.match(rawdata, i):
				146	if self.literal:
				147	data = rawdata[i]
				148	self.handle_data(data)
				149	self.lineno = self.lineno + string.count(data, '\n')
				150	i = i+1
				151	continue
				152	k = self.parse_starttag(i)
				153	if k < 0: break
				154	self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
				155	i = k
				156	continue
				157	if endtagopen.match(rawdata, i):
				158	k = self.parse_endtag(i)
				159	if k < 0: break
				160	self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
				161	i = k
				162	self.literal = 0
				163	continue
				164	if commentopen.match(rawdata, i):
				165	if self.literal:
				166	data = rawdata[i]
				167	self.handle_data(data)
				168	self.lineno = self.lineno + string.count(data, '\n')
				169	i = i+1
				170	continue
				171	k = self.parse_comment(i)
				172	if k < 0: break
				173	self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
				174	i = k
				175	continue
				176	if cdataopen.match(rawdata, i):
				177	k = self.parse_cdata(i)
				178	if k < 0: break
				179	self.lineno = self.lineno + string.count(rawdata[i:i], '\n')
				180	i = k
				181	continue
				182	res = procopen.match(rawdata, i)
				183	if res:
				184	k = self.parse_proc(i, res)
				185	if k < 0: break
				186	self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
				187	i = k
				188	continue
				189	res = special.match(rawdata, i)
				190	if res:
				191	if self.literal:
				192	data = rawdata[i]
				193	self.handle_data(data)
				194	self.lineno = self.lineno + string.count(data, '\n')
				195	i = i+1
				196	continue
				197	self.handle_special(res.group('special'))
				198	self.lineno = self.lineno + string.count(res.group(0), '\n')
				199	i = res.end(0)
				200	continue
				201	elif rawdata[i] == '&':
				202	res = charref.match(rawdata, i)
				203	if res is not None:
				204	i = res.end(0)
				205	if rawdata[i-1] != ';':
				206	self.syntax_error(self.lineno, '; missing in charref')
				207	i = i-1
				208	self.handle_charref(res.group('char')[:-1])
				209	self.lineno = self.lineno + string.count(res.group(0), '\n')
				210	continue
				211	res = entityref.match(rawdata, i)
				212	if res is not None:
				213	i = res.end(0)
				214	if rawdata[i-1] != ';':
				215	self.syntax_error(self.lineno, '; missing in entityref')
				216	i = i-1
				217	self.handle_entityref(res.group('name'))
				218	self.lineno = self.lineno + string.count(res.group(0), '\n')
				219	continue
				220	else:
				221	raise RuntimeError, 'neither < nor & ??'
				222	# We get here only if incomplete matches but
				223	# nothing else
				224	res = incomplete.match(rawdata, i)
				225	if not res:
				226	data = rawdata[i]
				227	self.handle_data(data)
				228	self.lineno = self.lineno + string.count(data, '\n')
				229	i = i+1
				230	continue
				231	j = res.end(0)
				232	if j == n:
				233	break # Really incomplete
				234	self.syntax_error(self.lineno, 'bogus < or &')
				235	data = res.group(0)
				236	self.handle_data(data)
				237	self.lineno = self.lineno + string.count(data, '\n')
				238	i = j
				239	# end while
				240	if end and i < n:
				241	data = rawdata[i:n]
				242	self.handle_data(data)
				243	self.lineno = self.lineno + string.count(data, '\n')
				244	i = n
				245	self.rawdata = rawdata[i:]
				246	# XXX if end: check for empty stack
				247
				248	# Internal -- parse comment, return length or -1 if not terminated
				249	def parse_comment(self, i):
				250	rawdata = self.rawdata
				251	if rawdata[i:i+4] <> '<!--':
				252	raise RuntimeError, 'unexpected call to handle_comment'
				253	res = commentclose.search(rawdata, i+4)
				254	if not res:
				255	return -1
				256	# doubledash search will succeed because it's a subset of commentclose
				257	if doubledash.search(rawdata, i+4).start(0) < res.start(0):
				258	self.syntax_error(self.lineno, "`--' inside comment")
				259	self.handle_comment(rawdata[i+4: res.start(0)])
				260	return res.end(0)
				261
				262	# Internal -- handle CDATA tag, return lenth or -1 if not terminated
				263	def parse_cdata(self, i):
				264	rawdata = self.rawdata
				265	if rawdata[i:i+9] <> '<![CDATA[':
				266	raise RuntimeError, 'unexpected call to handle_cdata'
				267	res = cdataclose.search(rawdata, i+9)
				268	if not res:
				269	return -1
				270	self.handle_cdata(rawdata[i+9:res.start(0)])
				271	return res.end(0)
				272
				273	def parse_proc(self, i, res):
				274	rawdata = self.rawdata
				275	if not res:
				276	raise RuntimeError, 'unexpected call to parse_proc'
				277	name = res.group('proc')
				278	res = procclose.search(rawdata, res.end(0))
				279	if not res:
				280	return -1
				281	self.handle_proc(name, rawdata[res.pos:res.start(0)])
				282	return res.end(0)
				283
				284	# Internal -- handle starttag, return length or -1 if not terminated
				285	def parse_starttag(self, i):
				286	rawdata = self.rawdata
				287	# i points to start of tag
				288	end = endbracket.search(rawdata, i+1)
				289	if not end:
				290	return -1
				291	j = end.start(0)
				292	# Now parse the data between i+1 and j into a tag and attrs
				293	attrdict = {}
				294	res = tagfind.match(rawdata, i+1)
				295	if not res:
				296	raise RuntimeError, 'unexpected call to parse_starttag'
				297	k = res.end(0)
				298	tag = res.group(0)
				299	if hasattr(self, tag + '_attributes'):
				300	attrlist = getattr(self, tag + '_attributes')
				301	else:
				302	attrlist = None
				303	self.lasttag = tag
				304	while k < j:
				305	res = attrfind.match(rawdata, k)
				306	if not res: break
				307	attrname, attrvalue = res.group('name', 'value')
				308	if attrvalue is None:
				309	self.syntax_error(self.lineno, 'no attribute value specified')
				310	attrvalue = attrname
				311	elif attrvalue[:1] == "'" == attrvalue[-1:] or \
				312	attrvalue[:1] == '"' == attrvalue[-1:]:
				313	attrvalue = attrvalue[1:-1]
				314	else:
				315	self.syntax_error(self.lineno, 'attribute value not quoted')
				316	# XXXX are attribute names case sensitive?
				317	attrname = string.lower(attrname)
				318	if attrlist is not None and attrname not in attrlist:
				319	self.syntax_error(self.lineno,
				320	'unknown attribute %s of element %s' %
				321	(attrname, tag))
				322	if attrdict.has_key(attrname):
				323	self.syntax_error(self.lineno, 'attribute specified twice')
				324	attrdict[attrname] = self.translate_references(attrvalue)
				325	k = res.end(0)
				326	res = starttagend.match(rawdata, k)
				327	if not res:
				328	self.syntax_error(self.lineno, 'garbage in start tag')
				329	self.finish_starttag(tag, attrdict)
				330	if res and res.group('slash') == '/':
				331	self.finish_endtag(tag)
				332	return end.end(0)
				333
				334	# Internal -- parse endtag
				335	def parse_endtag(self, i):
				336	rawdata = self.rawdata
				337	end = endbracket.search(rawdata, i+1)
				338	if not end:
				339	return -1
				340	res = tagfind.match(rawdata, i+2)
				341	if not res:
				342	self.syntax_error(self.lineno, 'no name specified in end tag')
				343	tag = ''
				344	k = i+2
				345	else:
				346	tag = res.group(0)
				347	k = res.end(0)
				348	if k != end.start(0):
				349	# check that there is only white space at end of tag
				350	res = space.match(rawdata, k)
				351	if res is None or res.end(0) != end.start(0):
				352	self.syntax_error(self.lineno, 'garbage in end tag')
				353	self.finish_endtag(tag)
				354	return end.end(0)
				355
				356	# Internal -- finish processing of start tag
				357	# Return -1 for unknown tag, 1 for balanced tag
				358	def finish_starttag(self, tag, attrs):
				359	self.stack.append(tag)
				360	try:
				361	method = getattr(self, 'start_' + tag)
				362	except AttributeError:
				363	self.unknown_starttag(tag, attrs)
				364	return -1
				365	else:
				366	self.handle_starttag(tag, method, attrs)
				367	return 1
				368
				369	# Internal -- finish processing of end tag
				370	def finish_endtag(self, tag):
				371	if not tag:
				372	found = len(self.stack) - 1
				373	if found < 0:
				374	self.unknown_endtag(tag)
				375	return
				376	else:
				377	if tag not in self.stack:
				378	try:
				379	method = getattr(self, 'end_' + tag)
				380	except AttributeError:
				381	self.unknown_endtag(tag)
				382	return
				383	found = len(self.stack)
				384	for i in range(found):
				385	if self.stack[i] == tag: found = i
				386	while len(self.stack) > found:
				387	tag = self.stack[-1]
				388	try:
				389	method = getattr(self, 'end_' + tag)
				390	except AttributeError:
				391	method = None
				392	if method:
				393	self.handle_endtag(tag, method)
				394	else:
				395	self.unknown_endtag(tag)
				396	del self.stack[-1]
				397
				398	# Overridable -- handle start tag
				399	def handle_starttag(self, tag, method, attrs):
				400	method(attrs)
				401
				402	# Overridable -- handle end tag
				403	def handle_endtag(self, tag, method):
				404	method()
				405
				406	# Example -- handle character reference, no need to override
				407	def handle_charref(self, name):
				408	try:
				409	if name[0] == 'x':
				410	n = string.atoi(name[1:], 16)
				411	else:
				412	n = string.atoi(name)
				413	except string.atoi_error:
				414	self.unknown_charref(name)
				415	return
				416	if not 0 <= n <= 255:
				417	self.unknown_charref(name)
				418	return
				419	self.handle_data(chr(n))
				420
				421	# Definition of entities -- derived classes may override
				422	entitydefs = \
				423	{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
				424
				425	# Example -- handle entity reference, no need to override
				426	def handle_entityref(self, name):
				427	table = self.entitydefs
				428	if table.has_key(name):
				429	self.handle_data(table[name])
				430	else:
				431	self.unknown_entityref(name)
				432	return
				433
				434	# Example -- handle data, should be overridden
				435	def handle_data(self, data):
				436	pass
				437
				438	# Example -- handle cdata, could be overridden
				439	def handle_cdata(self, data):
				440	pass
				441
				442	# Example -- handle comment, could be overridden
				443	def handle_comment(self, data):
				444	pass
				445
				446	# Example -- handle processing instructions, could be overridden
				447	def handle_proc(self, name, data):
				448	pass
				449
				450	# Example -- handle special instructions, could be overridden
				451	def handle_special(self, data):
				452	pass
				453
				454	# Example -- handle relatively harmless syntax errors, could be overridden
				455	def syntax_error(self, lineno, message):
				456	raise RuntimeError, 'Syntax error at line %d: %s' % (lineno, message)
				457
				458	# To be overridden -- handlers for unknown objects
				459	def unknown_starttag(self, tag, attrs): pass
				460	def unknown_endtag(self, tag): pass
				461	def unknown_charref(self, ref): pass
				462	def unknown_entityref(self, ref): pass
				463
				464
				465	class TestXMLParser(XMLParser):
				466
				467	def __init__(self, verbose=0):
				468	self.testdata = ""
				469	XMLParser.__init__(self, verbose)
				470
				471	def handle_data(self, data):
				472	self.testdata = self.testdata + data
				473	if len(`self.testdata`) >= 70:
				474	self.flush()
				475
				476	def flush(self):
				477	data = self.testdata
				478	if data:
				479	self.testdata = ""
				480	print 'data:', `data`
				481
				482	def handle_cdata(self, data):
				483	self.flush()
				484	print 'cdata:', `data`
				485
				486	def handle_proc(self, name, data):
				487	self.flush()
				488	print 'processing:',name,`data`
				489
				490	def handle_special(self, data):
				491	self.flush()
				492	print 'special:',`data`
				493
				494	def handle_comment(self, data):
				495	self.flush()
				496	r = `data`
				497	if len(r) > 68:
				498	r = r[:32] + '...' + r[-32:]
				499	print 'comment:', r
				500
				501	def syntax_error(self, lineno, message):
				502	print 'error at line %d:' % lineno, message
				503
				504	def unknown_starttag(self, tag, attrs):
				505	self.flush()
				506	if not attrs:
				507	print 'start tag: <' + tag + '>'
				508	else:
				509	print 'start tag: <' + tag,
				510	for name, value in attrs:
				511	print name + '=' + '"' + value + '"',
				512	print '>'
				513
				514	def unknown_endtag(self, tag):
				515	self.flush()
				516	print 'end tag: </' + tag + '>'
				517
				518	def unknown_entityref(self, ref):
				519	self.flush()
				520	print '*** unknown entity ref: &' + ref + ';'
				521
				522	def unknown_charref(self, ref):
				523	self.flush()
				524	print '*** unknown char ref: &#' + ref + ';'
				525
				526	def close(self):
				527	XMLParser.close(self)
				528	self.flush()
				529
				530	def test(args = None):
				531	import sys
				532
				533	if not args:
				534	args = sys.argv[1:]
				535
				536	if args and args[0] == '-s':
				537	args = args[1:]
				538	klass = XMLParser
				539	else:
				540	klass = TestXMLParser
				541
				542	if args:
				543	file = args[0]
				544	else:
				545	file = 'test.xml'
				546
				547	if file == '-':
				548	f = sys.stdin
				549	else:
				550	try:
				551	f = open(file, 'r')
				552	except IOError, msg:
				553	print file, ":", msg
				554	sys.exit(1)
				555
				556	data = f.read()
				557	if f is not sys.stdin:
				558	f.close()
				559
				560	x = klass()
				561	for c in data:
				562	x.feed(c)
				563	x.close()
				564
				565
# Run the command-line driver when this file is executed as a script.
if __name__ == '__main__':
    test()
				568