blob: 57d3ae4b3ce110276a8bd2153f7be04be3a34f84 [file] [log] [blame]
"""Shared support for scanning document type declarations in HTML and XHTML."""
2
import re
import string

# Matcher for a declaration name token (a letter followed by letters,
# digits, '-', '_' or '.') together with any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matcher for a single- or double-quoted string literal together with any
# whitespace that follows it.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# Only the two bound match methods above are needed from here on, so the
# module name is removed from this namespace.
del re
10
11
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.parse_comment()``; none of these are
    defined in this class, so subclasses have to supply them.  The scanning
    methods share one convention: they return the index just past what was
    consumed, or -1 when the buffer ends before the construct is complete.
    Syntax errors are reported through ``self.error()``, which is expected
    to raise.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a syntax error; subclasses must override this method."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: at least one newline was just counted in [i, j).
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Characters that parse_declaration() skips over one at a time in
    # addition to names and string literals; empty in this class.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        Returns the index just past the closing '>' after handing the
        declaration text (without "<!" and ">") to handle_decl() for a
        doctype or to unknown_decl() otherwise.  Returns -1 if the buffer
        ends before the declaration is complete, or if "<!" is followed by
        '-' or by the end of the buffer.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %s char in declaration" % repr(rawdata[j]))
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset whose first character is rawdata[i].

        On success the returned index is that of the '>' found after the
        closing ']' and optional whitespace.  Returns -1 if the buffer ends
        first.  declstartpos is the start of the enclosing declaration and
        is only used to update the position before reporting an error.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %s)"
                               % repr(s))
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four
                    # characters left, so "<!--" cannot be tested yet)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %s in internal subset"
                        % repr(name))
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name() never returns the end of the buffer as a
                # position, so rawdata[j] exists here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %s in internal subset" % repr(c))
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration body starting at rawdata[i].

        Returns the index just past the next '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration body starting at rawdata[i].

        Returns the index just past the closing '>', or -1 if the buffer
        ends before the declaration is complete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # The bound on j matters: an empty slice tests as being
                # "in" any string, so an unbounded scan would never stop
                # once j ran off the end of the buffer.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration body starting at rawdata[i].

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration body starting at rawdata[i].

        A leading '%' and the whitespace after it are skipped before the
        entity name.  Returns the index just past the closing '>', or -1
        if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (plus trailing whitespace) at rawdata[i].

        Returns (lowercased name, index past the token and whitespace).
        Returns (None, -1) when i is the end of the buffer or the match
        extends to the end of the buffer.  When no name starts at i, the
        position is updated and error() is called; nothing is returned
        explicitly after that call.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            end = m.end()
            if end == n:
                return None, -1  # end of buffer
            return m.group().strip().lower(), end
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Called with the text of a non-doctype declaration; no-op here."""
        pass