blob: 27352a1adf46ca841f395059067538d028e70fca [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""A parser for SGML, using the derived class as a static DTD."""
Guido van Rossum7c750e11995-02-27 13:16:55 +00002
# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Fred Drakea3bae332001-09-24 20:15:51 +000012import markupbase
Guido van Rossum1fef1811997-10-23 19:09:21 +000013import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
# Public names exported by a star-import of this module; the test
# helpers defined further down are deliberately not listed.
__all__ = ["SGMLParser", "SGMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing

# Next character in running text that may start markup or a reference.
interesting = re.compile('[&<]')

# Prefix of a reference, tag, end tag or declaration; goahead() uses it to
# decide whether the tail of the buffer might be completed by more data.
incomplete = re.compile(
    '&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    '<([a-zA-Z][^<>]*|'
    '/([a-zA-Z][^<>]*)?|'
    '![^<>]*)?'
)

# "&name" / "&#digits" followed by exactly one terminating character.
# NOTE: the terminator is part of the match, so callers must give it
# back when it is not ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# Start-tag recognisers: "<x" or "<>", "<name/", and "<name/data/".
starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Terminators: end of a processing instruction, end of a tag.
piclose = re.compile('>')
endbracket = re.compile('[<>]')

# Tag name, and one attribute (name with an optional "= value" part where
# the value is single-quoted, double-quoted or a bare token).
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000037
Fred Drake66957372001-03-16 20:04:57 +000038
class SGMLParseError(RuntimeError):
    """Error type used by the parser to report any parsing failure."""
42
Guido van Rossum7c750e11995-02-27 13:16:55 +000043
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
54
class SGMLParser(markupbase.ParserBase):
    """Base class that finds SGML markup and dispatches it to handlers.

    Typical use is ``p = SGMLParser(); p.feed(data); ...; p.close()``.
    A derived class plays the role of the DTD by defining ``start_foo``
    and ``end_foo`` (for ``<foo>`` and ``</foo>``) or ``do_foo`` (for a
    ``<foo>`` that has no end tag); tag names are lower-cased before the
    handler lookup.  Text between tags is passed to ``handle_data()``,
    possibly split into arbitrary chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed so far but not yet consumed
        self.stack = []         # open tags that have a start_* handler
        self.lasttag = '???'    # name of the last start tag; used for "<>"
        self.nomoretags = 0     # true: the rest of the input is plain data
        self.literal = 0        # true: start tags and '&' are plain data
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Literal mode is switched off again by goahead() once an end tag
        has been parsed.  Any arguments are accepted and ignored.

        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Report a parse error by raising SGMLParseError(message)."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker.  Whatever could not be consumed stays in
        self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The regex consumed the terminator; give it back
                    # unless it was the optional ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction starting at i.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the LENGTH of the instruction (including delimiters),
        or -1 if it is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text recorded by the last parse_starttag().

        The value is None after reset() and while no start tag has been
        completely parsed.
        """
        return self.__starttag_text

    def _expand_attr_refs(self, attrvalue):
        """Internal -- replace references inside an attribute value.

        Entity references are substituted only when the name is in
        self.entitydefs AND the reference ends in ';', since otherwise
        we may break <a href='?p=x&q=y'> which is very common.  Numeric
        character references are substituted whenever chr() accepts the
        number; anything else is copied through unchanged.
        """
        pieces = []
        pos = 0
        end = len(attrvalue)
        while pos < end:
            # Both reference patterns start with '&', so skip the regex
            # work for every other character.
            if attrvalue[pos] == '&':
                ent = entityref.match(attrvalue, pos)
                if (ent and ent.group(1) in self.entitydefs and
                        attrvalue[ent.end(1)] == ';'):
                    pieces.append(self.entitydefs[ent.group(1)])
                    pos = ent.end(0)
                    continue
                ref = charref.match(attrvalue, pos)
                if ref:
                    try:
                        char = chr(int(ref.group(1)))
                    except (ValueError, OverflowError):
                        # invalid character reference, don't substitute
                        # (OverflowError: the number is absurdly large)
                        pass
                    else:
                        pieces.append(char)
                        pos = ref.end(0)
                        # charref also matched the terminating character;
                        # keep it in the value unless it was ';', exactly
                        # as goahead() does for references in text.
                        if attrvalue[pos-1] != ';':
                            pos = pos - 1
                        continue
            # all other cases
            pieces.append(attrvalue[pos])
            pos = pos + 1
        return ''.join(pieces)

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index i.

        Parses the tag name and attributes, records the tag's source
        text for get_starttag_text() and calls finish_starttag() (or
        finish_shorttag() for the <tag/data/ shorthand).  Returns the
        index just past the tag, or -1 if the tag is not terminated yet.
        Attribute names and the tag name are lower-cased; an attribute
        without '=' gets its own name as value.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self._expand_attr_refs(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # A tag cut short by the next '<' does not consume that '<'.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def parse_endtag(self, i):
        """Internal -- parse the end tag beginning at index i.

        Calls finish_endtag() with the stripped, lower-cased tag name
        (which may be empty).  Returns the index just past the tag, or
        -1 if the tag is not terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Looks for a start_<tag> handler first, then do_<tag>.  Only
        tags with a start_ handler are pushed on self.stack.
        Returns -1 for unknown tag, 0 for open-only tag, 1 for balanced
        tag.
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag.

        An empty tag name closes the innermost open element.  A named
        tag closes every element opened after its most recent
        occurrence on the stack, innermost first.  A name that is not
        on the stack is reported through unknown_endtag() (no end_
        handler) or report_unbalanced() (handler exists).
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the LAST (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            # Pop only after the handler ran, so it still sees the tag.
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag (if verbose)."""
        if self.verbose:
            # Single-string print calls: same output as the two-part
            # print statements they replace.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: ' + str(self.stack))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        'name' is the decimal digit string of the reference.  Values in
        0..255 are passed to handle_data() as a single character;
        anything else goes to unknown_charref().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        Names missing from the mapping go to unknown_entityref().
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_entityref(self, ref):
        pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000447
448
class TestSGMLParser(SGMLParser):
    """Diagnostic parser that prints the events it receives.

    Character data is collected in self.testdata and printed in pieces;
    comments and every construct routed to an unknown_* hook are
    printed as they arrive.
    """

    def __init__(self, verbose=0):
        # Character data received but not printed yet.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; print it once its repr() reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data: ' + repr(pending))

    def handle_comment(self, data):
        """Print a comment, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            shown = shown[:32] + '...' + shown[-32:]
        print('comment: ' + shown)

    def unknown_starttag(self, tag, attrs):
        """Print a start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # One space between the tag, each name="value" pair and the
        # closing '>', e.g.: start tag: <a href="x" >
        words = ['start tag: <' + tag]
        for name, value in attrs:
            words.append(name + '=' + '"' + value + '"')
        words.append('>')
        print(' '.join(words))

    def unknown_endtag(self, tag):
        """Print an end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print an entity reference that has no definition."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a character reference that could not be converted."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print an unrecognised declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000502
503
def test(args=None):
    """Command-line driver: parse a file and print what was found.

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser instead of TestSGMLParser.
            The next argument is the file to read ('-' means standard
            input); it defaults to 'test.html'.

    The input is fed to the parser one character at a time.  If the
    file cannot be opened, a message is printed and the process exits
    with status 1.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError:
            # sys.exc_info() avoids the version-specific
            # "except IOError, msg" syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (path, msg))
            sys.exit(1)
        # Close the file even when reading it fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    for c in data:
        x.feed(c)
    x.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000538
539
# Run the command-line driver when this file is executed as a script.
if __name__ == '__main__':
    test()