# blob: acd07266707bcca2145c4b824616ab107a4e4781
"""Shared support for scanning document type declarations in HTML and XHTML."""
2
import re

# Bound ``match`` methods of two precompiled patterns.  Both are called as
# ``matcher(rawdata, pos)`` so that matching is anchored at ``pos``.
#
# _declname_match: a name token -- an ASCII letter followed by any number of
# letters, digits, '-', '_' or '.' -- plus any whitespace that follows it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# _declstringlit_match: a complete single- or double-quoted string literal
# (no embedded quote of the same kind) plus any whitespace that follows it.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# The patterns are compiled; remove the module name ``re`` from this
# module's namespace again.
del re
9
10
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods read the input buffer from ``self.rawdata``, which this
    class never assigns.  They also call ``self.handle_decl(data)`` and
    ``self.parse_comment(i, report=0)``, which are not defined by this
    class as shown here, so a subclass has to provide them along with an
    override of error().

    Scanning methods take an index into ``rawdata`` and return a new
    index, or -1 when the buffer ends before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this method.

        The callers in this class do not recover after calling error()
        (for example, _scan_name() would fall through and return None),
        so an override should raise.
        """
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j.

        Nothing is changed when the span is empty (i >= j).
        """
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # nlines > 0 guarantees a newline exists in [i, j).
            pos = rawdata.rindex("\n", i, j)
            # Offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra single characters that parse_declaration() skips over; it is
    # reset to '' on the instance whenever a doctype declaration starts.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." declaration starting at rawdata[i].

        On success the text between "<!" and ">" is passed to
        handle_decl() for a doctype declaration, or to unknown_decl()
        otherwise, and the index just past ">" is returned.  Returns -1
        when the buffer ends first, or when "<!" is followed by "-"
        (the start of a comment, which is not handled here).
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                # %r gives the same text the old backtick repr produced
                # and is valid syntax on every Python version.
                self.error(
                    "unexpected %r char in declaration" % c)
            if j < 0:
                return j
        return -1 # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at rawdata[i].

        ``declstartpos`` is the start of the enclosing declaration and is
        only used to update the position before reporting an error.
        Returns the index of the ">" that follows the closing "]" (and
        optional whitespace), or -1 if the buffer ends first.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)"
                               % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (too short to tell a
                    # comment opener "<!--" from a declaration name)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name() never returns the end of the buffer as a
                # valid index, so rawdata[j] exists here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the next ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:]:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing ">", or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading "%" and the whitespace after it are skipped first.
        Returns the index just past the closing ">", or -1 if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1    # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (and trailing whitespace) at rawdata[i].

        Returns ``(name, j)`` with the name lower-cased and ``j`` the
        index just past the token and its trailing whitespace.  Returns
        ``(None, -1)`` when ``i`` is the end of the buffer or the match
        runs up to the end of the buffer, since more input could still
        extend the token.  If no name starts at ``i`` the position is
        updated from ``declstartpos`` and error() is called.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration that is not a doctype; does nothing here."""
        pass