# blob: 2af5f1c23b6066284938fb1cc697bc1fc2fea6ea
"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the html.parser module.  It has no
documented public API and should not be used directly.

"""

import re

# Each pattern also swallows the whitespace that follows the token, so a
# successful match leaves the scan position on the next significant char.

# A declaration name: a letter followed by letters, digits, '-', '_' or '.'.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# A complete single- or double-quoted string literal.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# The end of a comment: '--', optional whitespace, '>'.
_commentclose = re.compile(r'--\s*>')
# The end of a standard marked section: ']]>' with optional whitespace.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# The end of an MS Office marked section such as <![if ...]>: a single ']>'.
_msmarkedsectionclose = re.compile(r']\s*>')

# Only the compiled patterns are needed; keep 're' out of the namespace.
del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods read the buffered input from ``self.rawdata`` and report
    what they find through ``self.handle_decl()``, ``self.handle_comment()``
    and ``self.unknown_decl()``; only the last one is defined here.

    The scanning methods return a position in ``rawdata`` on success and
    -1 when the buffer ends before the construct is complete.  Malformed
    input is reported through ``self.error()``, which subclasses must
    override.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "_markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report malformed input; must be overridden by subclasses."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the tracked position to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Advance lineno/offset over rawdata[i:j] and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # Offset restarts after the last newline inside the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted inside a declaration; reset to '' each
    # time a doctype declaration is started.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." construct starting at rawdata[i].

        Returns the index just past the construct, or -1 if it is not yet
        complete in the buffer.  A doctype declaration is passed to
        handle_decl(), any other named declaration to unknown_decl();
        comments and marked sections are delegated to parse_comment() and
        parse_marked_section().  If error() returns instead of raising,
        the scan is abandoned and -1 is returned.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+2] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.  Two characters are examined so
            # that a complete "--" still reaches the comment branch below.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    # According to the HTML5 specs sections "8.2.4.44 Bogus
                    # comment state" and "8.2.4.45 Markup declaration open
                    # state", a comment token should be emitted.
                    # Calling unknown_decl provides more flexibility though.
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in {"attlist", "linktype", "link", "element"}:
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                    # j was not advanced; stop rather than loop forever
                    # if error() returned.
                    return -1
                else:
                    self.error("unexpected '[' char in declaration")
                    return -1
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
                # j was not advanced; stop rather than loop forever if
                # error() returned.
                return -1
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section "<![...]...>" starting at rawdata[i].

        Returns the index just past the closing delimiter, or -1 if the
        section is incomplete.  When *report* is true the text between
        "<![" and the closing delimiter is passed to unknown_decl().  An
        unknown status keyword is reported through error(); if error()
        returns, -1 is returned.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in {"if", "else", "endif"}:
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
            # No closing pattern was selected, so there is nothing to
            # search for if error() returned.
            return -1
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment "<!--...-->" starting at rawdata[i].

        Returns the index just past the closing "-->", or -1 if the
        comment is not terminated in the buffer.  When *report* is true
        the comment body is passed to handle_comment().  error() is called
        if rawdata[i:] does not start with "<!--"; should it return, the
        search for the terminator still proceeds from i+4.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the doctype internal subset whose body starts at rawdata[i].

        Returns the index of the '>' that follows the closing ']' (after
        any whitespace), or -1 if the buffer ends first.  *declstartpos*
        is the start of the enclosing declaration and is used for position
        updates before errors.  If error() returns instead of raising,
        the scan is abandoned and -1 is returned.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                    return -1
                if (j + 4) > n:
                    # end of buffer; incomplete (not enough characters to
                    # tell a comment from a declaration)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in {"attlist", "element", "entity", "notation"}:
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                    # There is no _parse_doctype_<name> method to call.
                    return -1
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                    return -1
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
                # j was not advanced; stop rather than loop forever if
                # error() returned.
                return -1
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= len(rawdata):
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i].

        Returns the index just past the closing '>', or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i].

        A leading '%' and the whitespace after it are skipped before the
        entity name.  Returns the index just past the closing '>', or -1
        if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token; return the token and the new position,
    # or (None, -1) if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan the name token at rawdata[i].

        Returns (lower-cased name, index after the name and any trailing
        whitespace).  Returns (None, -1) when the buffer ends at *i* or
        the match runs to the end of the buffer, since the token may then
        be truncated.  If no name starts at *i*, error() is called after
        updating the position; (None, -1) is returned should it return.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])
            # Keep the (name, position) shape callers unpack.
            return None, -1

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a declaration that is not a doctype; default: ignore."""
        pass