blob: 3422980834019c03e765e154875aa40a7d65d2ca [file] [log] [blame]
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Guido van Rossum1fef1811997-10-23 19:09:21 +000012import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000013import string
14
# Public names of this module.  SGMLParseError is listed as well because
# the parser raises it to callers, who need the name in order to catch it.
__all__ = ["SGMLParser", "SGMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that regex escapes such as
# \? and \s reach the re module untouched instead of relying on Python
# leaving unknown string escapes alone.

# Next character that may start markup or a reference.
interesting = re.compile(r'[&<]')
# A reference or tag that may still be waiting for more input.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# Both reference patterns also consume the single character that
# terminates the reference; the caller gives it back unless it is ';'.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

starttagopen = re.compile(r'<[>a-zA-Z]')
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
endtagopen = re.compile(r'</[<>a-zA-Z]')
endbracket = re.compile(r'[<>]')
special = re.compile(r'<![^<>]*>')
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'--\s*>')
declopen = re.compile(r'<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
# Quoted string literal inside a declaration, plus trailing whitespace.
# (This used to be compiled twice with the same pattern; once is enough.)
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
# group(1) = attribute name, group(2) = optional "= value" part,
# group(3) = the value itself, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')

decldata = re.compile(r'[^>\'\"]+')
48
49
class SGMLParseError(RuntimeError):
    """Error type the SGML parser raises for every kind of parse failure."""
53
Guido van Rossum7c750e11995-02-27 13:16:55 +000054
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The DTD is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
65
class SGMLParser:
    """Base class that finds SGML markup and calls handler methods.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    A derived class acts as the DTD by defining start_foo()/end_foo()
    to handle <foo> and </foo>, or do_foo() to handle <foo> by itself
    (tag names are lower-cased before the lookup).  Text is delivered
    through handle_data(), possibly split into arbitrary chunks;
    references go through handle_entityref() and handle_charref().
    """

    # Interface -- initialize and reset this instance
    def __init__(self, verbose=0):
        """Store *verbose* (read by report_unbalanced()) and reset."""
        self.verbose = verbose
        self.reset()

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        """Drop buffered input and return to the initial parse state."""
        self.rawdata = ''       # input received but not yet consumed
        self.stack = []         # open tags that have a start_* handler
        self.lasttag = '???'    # tag name reused by the '<>' shorthand
        self.nomoretags = 0     # true: everything up to EOF is data
        self.literal = 0        # true: only an end tag counts as markup

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        """Treat all remaining input as plain data."""
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        """Enter literal mode; goahead() leaves it after the next end tag.

        Extra positional arguments are accepted and ignored.
        """
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        """Append *data* to the buffer and process what is complete."""
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        """Process everything still buffered as if EOF had been seen."""
        self.goahead(1)

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Scan the buffer, dispatching to the parse_* and handle_* methods.

        Input that cannot be interpreted yet (an unterminated tag,
        comment, declaration or reference) stays in self.rawdata for the
        next call, unless *end* is true, in which case the leftover is
        handed to handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything before the next '<' or '&' is plain data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start(0)
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    # End tags are honoured even in literal mode, and
                    # any end tag switches literal mode off.
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_comment() returns a length, not a position.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if piopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_pi() returns a length, not a position.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                match = special.match(rawdata, i)
                if match:
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern also ate the terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                raise SGMLParseError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Internal -- parse comment, return length or -1 if not terminated
    def parse_comment(self, i):
        """Parse the comment starting at index *i* of the buffer.

        Calls handle_comment() with the text between '<!--' and the
        closing '--' and returns the number of characters consumed, or
        -1 if the comment is not terminated yet.  Raises SGMLParseError
        if the buffer does not hold '<!--' at *i*.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise SGMLParseError('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        j = match.start(0)
        self.handle_comment(rawdata[i+4: j])
        j = match.end(0)
        return j-i

    # Internal -- parse declaration.
    def parse_declaration(self, i):
        """Parse the '<!...>' declaration starting at index *i*.

        Calls handle_decl() with the text between '<!' and '>' and
        returns the index just past the '>' (a position, unlike
        parse_comment() and parse_pi()), or -1 if more input is needed.
        Raises SGMLParseError on a character that is neither a letter,
        a quote nor '>'.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            else:
                raise SGMLParseError(
                    "unexpected char in declaration: %s" % repr(c))
        # end of buffer between tokens
        return -1

    # Internal -- parse processing instr, return length or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index *i*.

        Calls handle_pi() with the text between '<?' and the next '>'
        and returns the number of characters consumed, or -1 if the
        closing '>' has not arrived yet.  Raises SGMLParseError if the
        buffer does not hold '<?' at *i*.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            raise SGMLParseError('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    # Source text of the start tag most recently parsed; None until
    # parse_starttag() has recorded one.
    __starttag_text = None

    def get_starttag_text(self):
        """Return the text recorded by the latest parse_starttag() call."""
        return self.__starttag_text

    # Internal -- handle starttag, return new position or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag beginning at index *i* of the buffer.

        Handles the '<tag/data/' and '<>' shorthands as well as ordinary
        tags with attributes.  Returns the index at which parsing should
        resume, or -1 if the tag is not complete yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                raise SGMLParseError('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without "= value": the name is the value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume the bracket only if it is '>'; a '<' found by
        # endbracket is left in the buffer for the main loop.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    # Internal -- parse endtag
    def parse_endtag(self, i):
        """Parse the end tag beginning at index *i* of the buffer.

        Returns the index at which parsing should resume, or -1 if no
        closing bracket has been received yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
    def finish_shorttag(self, tag, data):
        """Report '<tag/data/' as a start tag, its data and an end tag."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    # Internal -- finish processing of start tag
    # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
    def finish_starttag(self, tag, attrs):
        """Dispatch a start tag to start_<tag>, do_<tag> or unknown_starttag.

        Returns 1 if start_<tag> exists (the tag is then pushed on the
        stack), 0 if only do_<tag> exists, and -1 if neither does.
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    # Internal -- finish processing of end tag
    def finish_endtag(self, tag):
        """Close *tag* together with every tag opened after it.

        An empty *tag* closes the innermost open tag.  A tag that is not
        on the stack goes to report_unbalanced() when an end_<tag>
        method exists and to unknown_endtag() otherwise.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # The loop runs over the whole stack, so 'found' ends up as
            # the index of the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """Call the start/do handler *method* with the attribute list."""
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Call the end handler *method*."""
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        """Print a diagnostic for an unbalanced end tag if verbose is set."""
        if self.verbose:
            # Single-argument print(...) writes the same text whether
            # print is a statement or a function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    # Example -- handle character reference, no need to override
    def handle_charref(self, name):
        """Turn the character reference &#name; into data.

        References that are not decimal numbers in the range 0..255 are
        passed to unknown_charref() instead.
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    # Example -- handle entity reference, no need to override
    def handle_entityref(self, name):
        """Turn the entity reference &name; into data via entitydefs.

        Names missing from the table go to unknown_entityref().
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        """Receive a chunk of character data; does nothing by default."""
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        """Receive the text of a comment; does nothing by default."""
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        """Receive the text of a declaration; does nothing by default."""
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        """Receive a processing instruction; does nothing by default."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000456
457
class TestSGMLParser(SGMLParser):
    """Parser subclass that prints every event it receives.

    Character data is collected in self.testdata and printed once its
    repr() reaches 70 characters, or when another event or close()
    forces a flush.  All output lines are built as a single string and
    written with single-argument print(...), which produces the same
    text whether print is a statement or a function.
    """

    def __init__(self, verbose=0):
        # testdata is set first, before the base-class initializer runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer *data*; flush once the buffered repr is 70+ chars."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of a long repr."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Pieces are joined with single spaces, so there is also a
            # space before the closing '>'.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a note about an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a note about an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000507
508
def test(args = None):
    """Command-line test driver: parse one file and report the events.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser base class instead of TestSGMLParser.  The next argument
    names the input file ('test.html' if absent, '-' for standard
    input).  If the file cannot be opened, a message is printed and the
    process exits with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        f = sys.stdin
    else:
        try:
            f = open(path, 'r')
        except IOError as err:
            print("%s : %s" % (path, err))
            sys.exit(1)

    # Close the file even when read() fails; stdin is never closed.
    try:
        data = f.read()
    finally:
        if f is not sys.stdin:
            f.close()

    x = klass()
    # The input is fed one character per call on purpose, so each feed()
    # sees the smallest possible chunk.
    for c in data:
        x.feed(c)
    x.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000543
544
# Run the command-line test driver when this file is executed as a script.
if __name__ == '__main__':
    test()