blob: 1db5423254c9a89674881babd9025f58660ac68a [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""A parser for SGML, using the derived class as a static DTD."""
Guido van Rossum7c750e11995-02-27 13:16:55 +00002
3# XXX This only supports those SGML features used by HTML.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
Fred Drakefb38c762001-07-16 18:30:35 +00008# and CDATA (character data -- only end tags are special). RCDATA is
9# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Fred Drakea3bae332001-09-24 20:15:51 +000012import markupbase
Guido van Rossum1fef1811997-10-23 19:09:21 +000013import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
# Public API.  SGMLParseError is exported as well because SGMLParser.error()
# raises it, so callers using "from sgmllib import *" need the name to
# catch parse errors.
__all__ = ["SGMLParser", "SGMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing

# The two characters that can start markup or a reference.
interesting = re.compile('[&<]')

# A prefix of a reference or of markup that may still be completed by
# more input: a partial entity/char reference, or a partial start tag,
# end tag or "<!" declaration.
incomplete = re.compile(
    '&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    '<([a-zA-Z][^<>]*|'
    '/([a-zA-Z][^<>]*)?|'
    '![^<>]*)?')

# Complete references; each pattern also consumes the one character
# that terminates the reference.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

# Start of a start tag: "<" followed by ">" or a letter.
starttagopen = re.compile('<[>a-zA-Z]')

# SGML short tag form: <tag/data/
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Terminators for processing instructions, tags and comments.
piclose = re.compile('>')
endbracket = re.compile('[<>]')
commentclose = re.compile(r'--\s*>')

# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')

# One attribute: a name, optionally followed by "=" and a value that is
# single-quoted, double-quoted or unquoted.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000038
Fred Drake66957372001-03-16 20:04:57 +000039
class SGMLParseError(RuntimeError):
    """Error type used to report every kind of parse failure."""
43
Guido van Rossum7c750e11995-02-27 13:16:55 +000044
45# SGML parser base class -- find tags and call handler functions.
46# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
47# The dtd is defined by deriving a class which defines methods
48# with special names to handle tags: start_foo and end_foo to handle
49# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
50# (Tags are converted to lower case for this purpose.) The data
51# between tags is passed to the parser by calling self.handle_data()
Jeremy Hyltona05e2932000-06-28 14:48:01 +000052# with some data as argument (the data may be split up in arbitrary
Guido van Rossum7c750e11995-02-27 13:16:55 +000053# chunks). Entity references are passed by calling
54# self.handle_entityref() with the entity reference as argument.
55
class SGMLParser(markupbase.ParserBase):
    """Event-driven SGML parser; a derived class acts as a static DTD.

    Feed text with feed() and finish with close().  For a tag <foo> the
    parser looks for a method start_foo (element with an end tag) or
    do_foo (element without one); for </foo> it looks for end_foo.  Tag
    names are lower-cased before the lookup.  Text between tags goes to
    handle_data(), possibly split into several chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''       # input received but not yet consumed
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # most recent start tag, used by "<>"
        self.nomoretags = 0     # true: everything up to EOF is plain data
        self.literal = 0        # true: start tags and references are data
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Report a parse error by raising SGMLParseError(message)."""
        raise SGMLParseError(message)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything up to the next '&' or '<' is plain data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # In literal mode a start tag is just text.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # parse_comment() returns an absolute index.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, hence the addition.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration() is not defined in this class;
                    # presumably it is inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed the terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment starting at index i.  Returns the
    # absolute index just past the comment close, or -1 if the comment
    # is not terminated yet.  handle_comment() is only called when
    # 'report' is true.
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            self.error('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start(0)
            self.handle_comment(rawdata[i+4: j])
        return match.end(0)

    # Extensions for the DOCTYPE scanner:
    # (not read in this class; presumably used by markupbase.ParserBase)
    _decl_otherchars = '='

    # Internal -- parse processing instr starting at index i.  Returns
    # the LENGTH of the instruction (not an index), or -1 if it is not
    # terminated yet.
    def parse_pi(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    __starttag_text = None

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag, or None."""
        return self.__starttag_text

    # Internal -- handle starttag at index i.  Returns the absolute
    # index just past the tag, or -1 if the tag is not terminated yet.
    def parse_starttag(self, i):
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without "=value": the name is the value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the surrounding quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # The tag may have been ended by '<' instead of '>'; only a
        # closing '>' is consumed.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag at index i.  Returns the absolute index
    # just past the tag, or -1 if the tag is not terminated yet.
    def parse_endtag(self, i):
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_<tag> handler go on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        if not tag:
            # Empty end tag "</>": close the innermost open element.
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Not open: unknown if there is no end_<tag> handler,
                # otherwise a known tag that is unbalanced.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Find the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        # Close everything from the top of the stack down to 'found'.
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        'name' is the decimal digit string of the reference.  Values in
        the range 0..255 are passed to handle_data() as a one-character
        string; anything else goes to unknown_charref().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        Names missing from the mapping go to unknown_entityref().
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000426
427
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints the events it receives.

    Character data is accumulated in self.testdata and printed by
    flush(); every other handler flushes first and then prints its own
    line.
    """

    def __init__(self, verbose=0):
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        # Buffer the text; print it once its repr() reaches 70 chars.
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data: ' + repr(pending))

    def handle_comment(self, data):
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            # Keep the first and last 32 characters of long comments.
            shown = shown[:32] + '...' + shown[-32:]
        print('comment: ' + shown)

    def unknown_starttag(self, tag, attrs):
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # One output line: tag, each name="value" pair and the closing
        # bracket, separated by single spaces.
        pieces = ['start tag: <' + tag]
        for name, value in attrs:
            pieces.append(name + '=' + '"' + value + '"')
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000477
478
def test(args=None):
    """Parse a file and, by default, print the events found in it.

    'args' is a list of command line style arguments; when it is empty
    or None, sys.argv[1:] is used.  A leading '-s' selects the plain
    SGMLParser (whose handlers do nothing) instead of TestSGMLParser.
    The next argument names the input file: '-' means standard input,
    and 'test.html' is used when no name is given.  If the file cannot
    be opened, a message is printed and the process exits with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() is used so that this works regardless of
            # the interpreter's "except ... as/," syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)
        # Close the file even if reading it fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feed one character at a time to exercise the parser's handling of
    # input that arrives in arbitrary chunks.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()