# blob: ddeb9835b80e5d76144c82a96cd394ccc9fe6fb8 [file] [log] [blame]
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well).  It has no documented
public API and should not be used directly.

"""

import re

# A declaration name token followed by optional whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# A single- or double-quoted string literal followed by optional whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# End of a comment: "--", optional whitespace, ">".
_commentclose = re.compile(r'--\s*>')
# End of a standard marked section: "]]>" with optional whitespace between.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# End of an MS Office style marked section: "]>".
_msmarkedsectionclose = re.compile(r']\s*>')

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods below read the input buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of these is
    defined here, so a subclass has to provide them.

    All scanning methods share one convention: they return the index just
    past what was consumed, or -1 when the buffer ends before the construct
    is complete.  Malformed input is reported through ``error()``.  The
    base ``error()`` raises; if a subclass overrides it with a version that
    returns, the scanners give up on the construct and return -1 rather
    than looping forever or failing on an unbound variable.
    """

    def __init__(self):
        """Refuse direct instantiation of the base class."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; must be overridden by subclasses."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # The offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Returns the index just past the closing ">", or -1 if the
        declaration is incomplete (or was rejected by a non-raising
        error() override).  A DOCTYPE declaration is passed to
        handle_decl(); any other named declaration goes to unknown_decl().
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # NOTE: this branch cannot be taken as written, because any
            # declaration whose first character is "-" has already
            # returned -1 above.  It is kept unchanged so that existing
            # callers see the same results.
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                _name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # j was not advanced; stop instead of rescanning "[".
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                # j was not advanced; stop instead of looping forever.
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting at rawdata[i] ("<![").

        Returns the index just past the section terminator, or -1 if the
        section is incomplete or its status keyword is not recognised and
        error() returned.  When *report* is true the section text (without
        the "<![" prefix and the terminator) is passed to unknown_decl().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # No terminator pattern applies to an unknown keyword.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return the index just past the comment
    # terminator, or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i] ("<!--").

        When *report* is true the comment body is passed to
        handle_comment().
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
            return -1
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the DOCTYPE internal subset whose content starts at i.

        On success the returned index points at the ">" that follows the
        closing "]"; -1 means the buffer ended first (or an error was
        reported and error() returned).
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete (this also covers the
                    # case where "<!" is the very end of the buffer)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # There is no _parse_doctype_<name> handler to call.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                _name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
                return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # j was not advanced; stop instead of looping forever.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at i."""
        _name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose element name starts at i."""
        rawdata = self.rawdata
        _name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            # name type [value] [#constraint]
            _name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if not rawdata[j:j+1]:
                    # end of buffer, incomplete
                    return -1
            else:
                _name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                _name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at i."""
        _name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                _name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at i."""
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the whitespace after "%"
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        _name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                _name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and return the lower-cased token and the
    # new position, or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token (plus trailing whitespace) at rawdata[i].

        A token that runs up to the very end of the buffer is treated as
        incomplete, since more of it may still arrive.  If no name starts
        at i, the problem is reported through error(); should error()
        return, (None, -1) is returned so that callers can still unpack
        the result.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration or marked section this class does not interpret."""
        pass