Blame - Lib/markupbase.py - platform/external/python/cpython3

blob: 57d3ae4b3ce110276a8bd2153f7be04be3a34f84 [file] [log] [blame]

Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	1	"""Shared support for scanning document type declarations in HTML and XHTML."""
				2
				3	import re
				4	import string
				5
				6	_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s').match
				7	_declstringlit_match = re.compile(r'(\'[^\']\'\|"[^"]")\s*').match
				8
				9	del re
				10
				11
				12	class ParserBase:
				13	"""Parser base class which provides some common support methods used
				14	by the SGML/HTML and XHTML parsers."""
				15
Fred Drake	5445f07	2001-10-26 18:02:28 +0000	[diff] [blame]	16	def __init__(self):
				17	if self.__class__ is ParserBase:
				18	raise RuntimeError(
				19	"markupbase.ParserBase must be subclassed")
				20
				21	def error(self, message):
				22	raise NotImplementedError(
				23	"subclasses of ParserBase must override error()")
				24
Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	25	def reset(self):
				26	self.lineno = 1
				27	self.offset = 0
				28
				29	def getpos(self):
				30	"""Return current line number and offset."""
				31	return self.lineno, self.offset
				32
				33	# Internal -- update line number and offset. This should be
				34	# called for each piece of data exactly once, in order -- in other
				35	# words the concatenation of all the input strings to this
				36	# function should be exactly the entire input.
				37	def updatepos(self, i, j):
				38	if i >= j:
				39	return j
				40	rawdata = self.rawdata
				41	nlines = string.count(rawdata, "\n", i, j)
				42	if nlines:
				43	self.lineno = self.lineno + nlines
				44	pos = string.rindex(rawdata, "\n", i, j) # Should not fail
				45	self.offset = j-(pos+1)
				46	else:
				47	self.offset = self.offset + j-i
				48	return j
				49
				50	_decl_otherchars = ''
				51
				52	# Internal -- parse declaration (for use by subclasses).
				53	def parse_declaration(self, i):
				54	# This is some sort of declaration; in "HTML as
				55	# deployed," this should only be the document type
				56	# declaration ("<!DOCTYPE html...>").
				57	rawdata = self.rawdata
Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	58	j = i + 2
				59	assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
				60	if rawdata[j:j+1] in ("-", ""):
				61	# Start of comment followed by buffer boundary,
				62	# or just a buffer boundary.
				63	return -1
				64	# in practice, this should look like: ((name\|stringlit) S*)+ '>'
				65	n = len(rawdata)
				66	decltype, j = self._scan_name(j, i)
				67	if j < 0:
				68	return j
				69	if decltype == "doctype":
				70	self._decl_otherchars = ''
				71	while j < n:
				72	c = rawdata[j]
				73	if c == ">":
				74	# end of declaration syntax
				75	data = rawdata[i+2:j]
				76	if decltype == "doctype":
				77	self.handle_decl(data)
				78	else:
				79	self.unknown_decl(data)
				80	return j + 1
				81	if c in "\"'":
				82	m = _declstringlit_match(rawdata, j)
				83	if not m:
				84	return -1 # incomplete
				85	j = m.end()
				86	elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
				87	name, j = self._scan_name(j, i)
				88	elif c in self._decl_otherchars:
				89	j = j + 1
				90	elif c == "[":
				91	if decltype == "doctype":
				92	j = self._parse_doctype_subset(j + 1, i)
				93	else:
				94	self.error("unexpected '[' char in declaration")
				95	else:
				96	self.error(
				97	"unexpected %s char in declaration" % `rawdata[j]`)
				98	if j < 0:
				99	return j
				100	return -1 # incomplete
				101
				102	# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
				103	# returning the index just past any whitespace following the trailing ']'.
				104	def _parse_doctype_subset(self, i, declstartpos):
				105	rawdata = self.rawdata
				106	n = len(rawdata)
				107	j = i
				108	while j < n:
				109	c = rawdata[j]
				110	if c == "<":
				111	s = rawdata[j:j+2]
				112	if s == "<":
				113	# end of buffer; incomplete
				114	return -1
				115	if s != "<!":
				116	self.updatepos(declstartpos, j + 1)
				117	self.error("unexpected char in internal subset (in %s)"
				118	% `s`)
				119	if (j + 2) == n:
				120	# end of buffer; incomplete
				121	return -1
				122	if (j + 4) > n:
				123	# end of buffer; incomplete
				124	return -1
				125	if rawdata[j:j+4] == "<!--":
				126	j = self.parse_comment(j, report=0)
				127	if j < 0:
				128	return j
				129	continue
				130	name, j = self._scan_name(j + 2, declstartpos)
				131	if j == -1:
				132	return -1
				133	if name not in ("attlist", "element", "entity", "notation"):
				134	self.updatepos(declstartpos, j + 2)
				135	self.error(
				136	"unknown declaration %s in internal subset" % `name`)
				137	# handle the individual names
				138	meth = getattr(self, "_parse_doctype_" + name)
				139	j = meth(j, declstartpos)
				140	if j < 0:
				141	return j
				142	elif c == "%":
				143	# parameter entity reference
				144	if (j + 1) == n:
				145	# end of buffer; incomplete
				146	return -1
				147	s, j = self._scan_name(j + 1, declstartpos)
				148	if j < 0:
				149	return j
				150	if rawdata[j] == ";":
				151	j = j + 1
				152	elif c == "]":
				153	j = j + 1
				154	while j < n and rawdata[j] in string.whitespace:
				155	j = j + 1
				156	if j < n:
				157	if rawdata[j] == ">":
				158	return j
				159	self.updatepos(declstartpos, j)
				160	self.error("unexpected char after internal subset")
				161	else:
				162	return -1
				163	elif c in string.whitespace:
				164	j = j + 1
				165	else:
				166	self.updatepos(declstartpos, j)
				167	self.error("unexpected char %s in internal subset" % `c`)
				168	# end of buffer reached
				169	return -1
				170
				171	# Internal -- scan past <!ELEMENT declarations
				172	def _parse_doctype_element(self, i, declstartpos):
Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	173	name, j = self._scan_name(i, declstartpos)
				174	if j == -1:
				175	return -1
				176	# style content model; just skip until '>'
Fred Drake	5445f07	2001-10-26 18:02:28 +0000	[diff] [blame]	177	rawdata = self.rawdata
Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	178	if '>' in rawdata[j:]:
				179	return string.find(rawdata, ">", j) + 1
				180	return -1
				181
				182	# Internal -- scan past <!ATTLIST declarations
				183	def _parse_doctype_attlist(self, i, declstartpos):
				184	rawdata = self.rawdata
				185	name, j = self._scan_name(i, declstartpos)
				186	c = rawdata[j:j+1]
				187	if c == "":
				188	return -1
				189	if c == ">":
				190	return j + 1
				191	while 1:
				192	# scan a series of attribute descriptions; simplified:
				193	# name type [value] [#constraint]
				194	name, j = self._scan_name(j, declstartpos)
				195	if j < 0:
				196	return j
				197	c = rawdata[j:j+1]
				198	if c == "":
				199	return -1
				200	if c == "(":
				201	# an enumerated type; look for ')'
				202	if ")" in rawdata[j:]:
				203	j = string.find(rawdata, ")", j) + 1
				204	else:
				205	return -1
				206	while rawdata[j:j+1] in string.whitespace:
				207	j = j + 1
				208	if not rawdata[j:]:
				209	# end of buffer, incomplete
				210	return -1
				211	else:
				212	name, j = self._scan_name(j, declstartpos)
				213	c = rawdata[j:j+1]
				214	if not c:
				215	return -1
				216	if c in "'\"":
				217	m = _declstringlit_match(rawdata, j)
				218	if m:
				219	j = m.end()
				220	else:
				221	return -1
				222	c = rawdata[j:j+1]
				223	if not c:
				224	return -1
				225	if c == "#":
				226	if rawdata[j:] == "#":
				227	# end of buffer
				228	return -1
				229	name, j = self._scan_name(j + 1, declstartpos)
				230	if j < 0:
				231	return j
				232	c = rawdata[j:j+1]
				233	if not c:
				234	return -1
				235	if c == '>':
				236	# all done
				237	return j + 1
				238
				239	# Internal -- scan past <!NOTATION declarations
				240	def _parse_doctype_notation(self, i, declstartpos):
				241	name, j = self._scan_name(i, declstartpos)
				242	if j < 0:
				243	return j
				244	rawdata = self.rawdata
				245	while 1:
				246	c = rawdata[j:j+1]
				247	if not c:
				248	# end of buffer; incomplete
				249	return -1
				250	if c == '>':
				251	return j + 1
				252	if c in "'\"":
				253	m = _declstringlit_match(rawdata, j)
				254	if not m:
				255	return -1
				256	j = m.end()
				257	else:
				258	name, j = self._scan_name(j, declstartpos)
				259	if j < 0:
				260	return j
				261
				262	# Internal -- scan past <!ENTITY declarations
				263	def _parse_doctype_entity(self, i, declstartpos):
				264	rawdata = self.rawdata
				265	if rawdata[i:i+1] == "%":
				266	j = i + 1
				267	while 1:
				268	c = rawdata[j:j+1]
				269	if not c:
				270	return -1
				271	if c in string.whitespace:
				272	j = j + 1
				273	else:
				274	break
				275	else:
				276	j = i
				277	name, j = self._scan_name(j, declstartpos)
				278	if j < 0:
				279	return j
				280	while 1:
				281	c = self.rawdata[j:j+1]
				282	if not c:
				283	return -1
				284	if c in "'\"":
				285	m = _declstringlit_match(rawdata, j)
				286	if m:
				287	j = m.end()
				288	else:
				289	return -1 # incomplete
				290	elif c == ">":
				291	return j + 1
				292	else:
				293	name, j = self._scan_name(j, declstartpos)
				294	if j < 0:
				295	return j
				296
				297	# Internal -- scan a name token and the new position and the token, or
				298	# return -1 if we've reached the end of the buffer.
				299	def _scan_name(self, i, declstartpos):
				300	rawdata = self.rawdata
				301	n = len(rawdata)
				302	if i == n:
				303	return None, -1
				304	m = _declname_match(rawdata, i)
				305	if m:
				306	s = m.group()
				307	name = s.strip()
				308	if (i + len(s)) == n:
				309	return None, -1 # end of buffer
Fred Drake	1cffd5c	2001-09-24 20:04:29 +0000	[diff] [blame]	310	return string.lower(name), m.end()
Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	311	else:
				312	self.updatepos(declstartpos, i)
Fred Drake	3d32be1	2001-10-13 15:59:47 +0000	[diff] [blame]	313	self.error("expected name token")
Fred Drake	5445f07	2001-10-26 18:02:28 +0000	[diff] [blame]	314
				315	# To be overridden -- handlers for unknown objects
				316	def unknown_decl(self, data):
				317	pass