Blame - Lib/markupbase.py - platform/external/python/cpython3

blob: 2055676fdbbc6beb954fa60c745ab55d8b1bfffd [file] [log] [blame]

Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	1	"""Shared support for scanning document type declarations in HTML and XHTML."""
				2
				3	import re
				4	import string
				5
				6	_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]\s').match
				7	_declstringlit_match = re.compile(r'(\'[^\']\'\|"[^"]")\s*').match
				8
				9	del re
				10
				11
				12	class ParserBase:
				13	"""Parser base class which provides some common support methods used
				14	by the SGML/HTML and XHTML parsers."""
				15
				16	def reset(self):
				17	self.lineno = 1
				18	self.offset = 0
				19
				20	def getpos(self):
				21	"""Return current line number and offset."""
				22	return self.lineno, self.offset
				23
				24	# Internal -- update line number and offset. This should be
				25	# called for each piece of data exactly once, in order -- in other
				26	# words the concatenation of all the input strings to this
				27	# function should be exactly the entire input.
				28	def updatepos(self, i, j):
				29	if i >= j:
				30	return j
				31	rawdata = self.rawdata
				32	nlines = string.count(rawdata, "\n", i, j)
				33	if nlines:
				34	self.lineno = self.lineno + nlines
				35	pos = string.rindex(rawdata, "\n", i, j) # Should not fail
				36	self.offset = j-(pos+1)
				37	else:
				38	self.offset = self.offset + j-i
				39	return j
				40
				41	_decl_otherchars = ''
				42
				43	# Internal -- parse declaration (for use by subclasses).
				44	def parse_declaration(self, i):
				45	# This is some sort of declaration; in "HTML as
				46	# deployed," this should only be the document type
				47	# declaration ("<!DOCTYPE html...>").
				48	rawdata = self.rawdata
				49	import sys
				50	j = i + 2
				51	assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
				52	if rawdata[j:j+1] in ("-", ""):
				53	# Start of comment followed by buffer boundary,
				54	# or just a buffer boundary.
				55	return -1
				56	# in practice, this should look like: ((name\|stringlit) S*)+ '>'
				57	n = len(rawdata)
				58	decltype, j = self._scan_name(j, i)
				59	if j < 0:
				60	return j
				61	if decltype == "doctype":
				62	self._decl_otherchars = ''
				63	while j < n:
				64	c = rawdata[j]
				65	if c == ">":
				66	# end of declaration syntax
				67	data = rawdata[i+2:j]
				68	if decltype == "doctype":
				69	self.handle_decl(data)
				70	else:
				71	self.unknown_decl(data)
				72	return j + 1
				73	if c in "\"'":
				74	m = _declstringlit_match(rawdata, j)
				75	if not m:
				76	return -1 # incomplete
				77	j = m.end()
				78	elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
				79	name, j = self._scan_name(j, i)
				80	elif c in self._decl_otherchars:
				81	j = j + 1
				82	elif c == "[":
				83	if decltype == "doctype":
				84	j = self._parse_doctype_subset(j + 1, i)
				85	else:
				86	self.error("unexpected '[' char in declaration")
				87	else:
				88	self.error(
				89	"unexpected %s char in declaration" % `rawdata[j]`)
				90	if j < 0:
				91	return j
				92	return -1 # incomplete
				93
				94	# Internal -- scan past the internal subset in a <!DOCTYPE declaration,
				95	# returning the index just past any whitespace following the trailing ']'.
				96	def _parse_doctype_subset(self, i, declstartpos):
				97	rawdata = self.rawdata
				98	n = len(rawdata)
				99	j = i
				100	while j < n:
				101	c = rawdata[j]
				102	if c == "<":
				103	s = rawdata[j:j+2]
				104	if s == "<":
				105	# end of buffer; incomplete
				106	return -1
				107	if s != "<!":
				108	self.updatepos(declstartpos, j + 1)
				109	self.error("unexpected char in internal subset (in %s)"
				110	% `s`)
				111	if (j + 2) == n:
				112	# end of buffer; incomplete
				113	return -1
				114	if (j + 4) > n:
				115	# end of buffer; incomplete
				116	return -1
				117	if rawdata[j:j+4] == "<!--":
				118	j = self.parse_comment(j, report=0)
				119	if j < 0:
				120	return j
				121	continue
				122	name, j = self._scan_name(j + 2, declstartpos)
				123	if j == -1:
				124	return -1
				125	if name not in ("attlist", "element", "entity", "notation"):
				126	self.updatepos(declstartpos, j + 2)
				127	self.error(
				128	"unknown declaration %s in internal subset" % `name`)
				129	# handle the individual names
				130	meth = getattr(self, "_parse_doctype_" + name)
				131	j = meth(j, declstartpos)
				132	if j < 0:
				133	return j
				134	elif c == "%":
				135	# parameter entity reference
				136	if (j + 1) == n:
				137	# end of buffer; incomplete
				138	return -1
				139	s, j = self._scan_name(j + 1, declstartpos)
				140	if j < 0:
				141	return j
				142	if rawdata[j] == ";":
				143	j = j + 1
				144	elif c == "]":
				145	j = j + 1
				146	while j < n and rawdata[j] in string.whitespace:
				147	j = j + 1
				148	if j < n:
				149	if rawdata[j] == ">":
				150	return j
				151	self.updatepos(declstartpos, j)
				152	self.error("unexpected char after internal subset")
				153	else:
				154	return -1
				155	elif c in string.whitespace:
				156	j = j + 1
				157	else:
				158	self.updatepos(declstartpos, j)
				159	self.error("unexpected char %s in internal subset" % `c`)
				160	# end of buffer reached
				161	return -1
				162
				163	# Internal -- scan past <!ELEMENT declarations
				164	def _parse_doctype_element(self, i, declstartpos):
				165	rawdata = self.rawdata
				166	n = len(rawdata)
				167	name, j = self._scan_name(i, declstartpos)
				168	if j == -1:
				169	return -1
				170	# style content model; just skip until '>'
				171	if '>' in rawdata[j:]:
				172	return string.find(rawdata, ">", j) + 1
				173	return -1
				174
				175	# Internal -- scan past <!ATTLIST declarations
				176	def _parse_doctype_attlist(self, i, declstartpos):
				177	rawdata = self.rawdata
				178	name, j = self._scan_name(i, declstartpos)
				179	c = rawdata[j:j+1]
				180	if c == "":
				181	return -1
				182	if c == ">":
				183	return j + 1
				184	while 1:
				185	# scan a series of attribute descriptions; simplified:
				186	# name type [value] [#constraint]
				187	name, j = self._scan_name(j, declstartpos)
				188	if j < 0:
				189	return j
				190	c = rawdata[j:j+1]
				191	if c == "":
				192	return -1
				193	if c == "(":
				194	# an enumerated type; look for ')'
				195	if ")" in rawdata[j:]:
				196	j = string.find(rawdata, ")", j) + 1
				197	else:
				198	return -1
				199	while rawdata[j:j+1] in string.whitespace:
				200	j = j + 1
				201	if not rawdata[j:]:
				202	# end of buffer, incomplete
				203	return -1
				204	else:
				205	name, j = self._scan_name(j, declstartpos)
				206	c = rawdata[j:j+1]
				207	if not c:
				208	return -1
				209	if c in "'\"":
				210	m = _declstringlit_match(rawdata, j)
				211	if m:
				212	j = m.end()
				213	else:
				214	return -1
				215	c = rawdata[j:j+1]
				216	if not c:
				217	return -1
				218	if c == "#":
				219	if rawdata[j:] == "#":
				220	# end of buffer
				221	return -1
				222	name, j = self._scan_name(j + 1, declstartpos)
				223	if j < 0:
				224	return j
				225	c = rawdata[j:j+1]
				226	if not c:
				227	return -1
				228	if c == '>':
				229	# all done
				230	return j + 1
				231
				232	# Internal -- scan past <!NOTATION declarations
				233	def _parse_doctype_notation(self, i, declstartpos):
				234	name, j = self._scan_name(i, declstartpos)
				235	if j < 0:
				236	return j
				237	rawdata = self.rawdata
				238	while 1:
				239	c = rawdata[j:j+1]
				240	if not c:
				241	# end of buffer; incomplete
				242	return -1
				243	if c == '>':
				244	return j + 1
				245	if c in "'\"":
				246	m = _declstringlit_match(rawdata, j)
				247	if not m:
				248	return -1
				249	j = m.end()
				250	else:
				251	name, j = self._scan_name(j, declstartpos)
				252	if j < 0:
				253	return j
				254
				255	# Internal -- scan past <!ENTITY declarations
				256	def _parse_doctype_entity(self, i, declstartpos):
				257	rawdata = self.rawdata
				258	if rawdata[i:i+1] == "%":
				259	j = i + 1
				260	while 1:
				261	c = rawdata[j:j+1]
				262	if not c:
				263	return -1
				264	if c in string.whitespace:
				265	j = j + 1
				266	else:
				267	break
				268	else:
				269	j = i
				270	name, j = self._scan_name(j, declstartpos)
				271	if j < 0:
				272	return j
				273	while 1:
				274	c = self.rawdata[j:j+1]
				275	if not c:
				276	return -1
				277	if c in "'\"":
				278	m = _declstringlit_match(rawdata, j)
				279	if m:
				280	j = m.end()
				281	else:
				282	return -1 # incomplete
				283	elif c == ">":
				284	return j + 1
				285	else:
				286	name, j = self._scan_name(j, declstartpos)
				287	if j < 0:
				288	return j
				289
				290	# Internal -- scan a name token and the new position and the token, or
				291	# return -1 if we've reached the end of the buffer.
				292	def _scan_name(self, i, declstartpos):
				293	rawdata = self.rawdata
				294	n = len(rawdata)
				295	if i == n:
				296	return None, -1
				297	m = _declname_match(rawdata, i)
				298	if m:
				299	s = m.group()
				300	name = s.strip()
				301	if (i + len(s)) == n:
				302	return None, -1 # end of buffer
Fred Drake	1cffd5c	2001-09-24 20:04:29 +0000	[diff] [blame]	303	return string.lower(name), m.end()
Fred Drake	68f8a80	2001-09-24 20:01:28 +0000	[diff] [blame]	304	else:
				305	self.updatepos(declstartpos, i)
				306	self.error("expected name token", self.getpos())