"""Shared support for scanning document type declarations in HTML and XHTML.

This module is used as a foundation for the HTMLParser and sgmllib
modules (indirectly, for htmllib as well). It has no documented
public API and should not be used directly.

"""

import re

# Bound ``match`` methods: both patterns are only ever applied at an explicit
# offset in the buffer, and both swallow any whitespace trailing the token.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Terminators that are searched for from a starting offset.
_commentclose = re.compile(r'--\s*>')
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

_msmarkedsectionclose = re.compile(r']\s*>')

# The compiled patterns are all that is needed; keep ``re`` out of the
# module namespace.
del re
22
23
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from ``self.rawdata`` and call
    ``self.handle_decl()`` and ``self.handle_comment()``; none of those are
    defined by this class, so subclasses have to supply them.  Subclasses
    must also override error().

    Scanning methods return the index just past what they consumed, or a
    negative value (-1) when the buffer ends before the construct is
    complete.  Code following a call to error() assumes that error() does
    not return (the base implementation always raises).
    """

    def __init__(self):
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            pos = rawdata.rindex("\n", i, j)  # Should not fail
            # The offset restarts after the last newline in the span.
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted (and skipped) inside a declaration.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!..." construct starting at rawdata[i].

        Returns the index just past the construct, or a negative value if
        the buffer does not yet hold all of it.  A doctype declaration is
        passed to handle_decl(); any other named declaration goes to
        unknown_decl().
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        # ISO 8879:1986, however, has more complex
        # declaration syntax for elements in <!...>, including:
        # --comment--
        # [marked section]
        # name in the following list: ENTITY, DOCTYPE, ELEMENT,
        # ATTLIST, NOTATION, SHORTREF, USEMAP,
        # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] == ">":
            # the empty comment <!>
            return j + 1
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # A simple, practical version could look like: ((name|stringlit) S*) + '>'
        n = len(rawdata)
        # NOTE: any "<!-" prefix has already returned -1 above, so the
        # comment branch below cannot be reached from this method as
        # written; it is kept unchanged.
        if rawdata[j:j+2] == '--':  # comment
            # Locate --.*-- as the body of the comment
            return self.parse_comment(i)
        elif rawdata[j] == '[':  # marked section
            # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
            # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
            # Note that this is extended by Microsoft Office "Save as Web" function
            # to include [if...] and [endif].
            return self.parse_marked_section(i)
        else:  # all other declaration elements
            decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                # this could be handled in a separate doctype parser
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                elif decltype in ("attlist", "linktype", "link", "element"):
                    # must tolerate []'d groups in a content model in an element declaration
                    # also in data attribute specifications of attlist declaration
                    # also link type declaration subsets in linktype declarations
                    # also link attribute specification lists in link declarations
                    self.error("unsupported '[' char in %s declaration" % decltype)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- parse a marked section
    # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
    def parse_marked_section(self, i, report=1):
        """Parse the marked section starting at rawdata[i] ("<![").

        Returns the index just past the section terminator, or a negative
        value if the section is not complete in the buffer.  When *report*
        is true the text between "<![" and the terminator is passed to
        unknown_decl().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
        sectName, j = self._scan_name(i + 3, i)
        if j < 0:
            return j
        if sectName in ("temp", "cdata", "ignore", "include", "rcdata"):
            # look for standard ]]> ending
            match = _markedsectionclose.search(rawdata, i + 3)
        elif sectName in ("if", "else", "endif"):
            # look for MS Office ]> ending
            match = _msmarkedsectionclose.search(rawdata, i + 3)
        else:
            # error() must not return here: ``match`` is only bound by the
            # two branches above.
            self.error('unknown status keyword %r in marked section' % rawdata[i+3:j])
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.unknown_decl(rawdata[i+3: j])
        return match.end(0)

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i, report=1):
        """Parse the comment starting at rawdata[i] ("<!--").

        Returns the index just past the comment terminator, or -1 if the
        comment is not terminated in the buffer.  When *report* is true the
        comment text is passed to handle_comment().
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = _commentclose.search(rawdata, i + 4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)" % s)
                if (j + 4) > n:
                    # end of buffer; incomplete (fewer than four
                    # characters left, so "<!--" cannot be tested yet)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name() never returns the end-of-buffer index, so
                # rawdata[j] is safe to read here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j].isspace():
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c.isspace():
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            # buffer ended inside the element name
            return -1
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                while rawdata[j:j+1].isspace():
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    # buffer ended inside the type name
                    return -1
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if j + 1 == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # skip the '%' and any whitespace that follows it
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c.isspace():
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token starting at rawdata[i]; return the
    # lower-cased token and the new position, or (None, -1) if we've
    # reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                # The match runs to the end of the buffer, so the name (or
                # its trailing whitespace) may continue in later data.
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token at %r"
                       % rawdata[declstartpos:declstartpos+20])

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        pass