blob: e5860247d8b1a0c5afe2da4ed805aeeecfb6c898 [file] [log] [blame]
Guido van Rossume7b146f2000-02-04 15:28:42 +00001"""A parser for SGML, using the derived class as a static DTD."""
Guido van Rossum7c750e11995-02-27 13:16:55 +00002
3# XXX This only supports those SGML features used by HTML.
4
5# XXX There should be a way to distinguish between PCDATA (parsed
6# character data -- the normal case), RCDATA (replaceable character
7# data -- only char and entity references and end tags are special)
Fred Drakefb38c762001-07-16 18:30:35 +00008# and CDATA (character data -- only end tags are special). RCDATA is
9# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Guido van Rossum1fef1811997-10-23 19:09:21 +000012import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000013import string
14
# Public API.  SGMLParseError is raised by SGMLParser, so export it too:
# otherwise ``from <module> import *`` users cannot name it in an
# ``except`` clause.
__all__ = ["SGMLParser", "SGMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing

# First character that may start markup or a reference.
interesting = re.compile('[&<]')
# Prefix of a reference or tag that may still be completed by more input.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# References: the trailing character class consumes the terminator, which
# the caller gives back unless it is ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Raw string: '\?' is not a valid escape in an ordinary string literal.
piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</[<>a-zA-Z]')
endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
declopen = re.compile('<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
# Quoted literal inside a declaration, plus trailing whitespace.
# (This pattern used to be compiled twice under the same name.)
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# Group 1: attribute name; group 2: the optional "= value" part;
# group 3: the value, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')

decldata = re.compile(r'[^>\'\"]+')
48
49
class SGMLParseError(RuntimeError):
    """Raised by the parser for every kind of parse error."""
53
Guido van Rossum7c750e11995-02-27 13:16:55 +000054
55# SGML parser base class -- find tags and call handler functions.
56# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
57# The dtd is defined by deriving a class which defines methods
58# with special names to handle tags: start_foo and end_foo to handle
59# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
60# (Tags are converted to lower case for this purpose.) The data
61# between tags is passed to the parser by calling self.handle_data()
Jeremy Hyltona05e2932000-06-28 14:48:01 +000062# with some data as argument (the data may be split up in arbitrary
Guido van Rossum7c750e11995-02-27 13:16:55 +000063# chunks). Entity references are passed by calling
64# self.handle_entityref() with the entity reference as argument.
65
class SGMLParser:
    """Event-driven SGML parser base class.

    Text is supplied with feed() and finished with close().  Start tags
    are dispatched to ``start_<tag>`` or ``do_<tag>`` methods and end tags
    to ``end_<tag>`` methods looked up on the instance (tag names are
    lowercased first).  Everything else is reported through the
    ``handle_*`` and ``unknown_*`` hooks, which do nothing by default.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''       # text fed but not yet consumed
        self.stack = []         # open tags that have a start_<tag> handler
        self.lasttag = '???'    # most recent start tag name, used for '<>'
        self.nomoretags = 0
        self.literal = 0

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).  Intended for derived classes only.

        Any positional arguments are accepted and ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Append *data* to the pending text and process what is possible.

        May be called as often as wanted with as little or as much text
        as wanted; the actual processing is done by goahead().
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data as if followed by EOF."""
        self.goahead(1)

    def goahead(self, end):
        """Internal -- handle buffered data as far as reasonable.

        May leave data in self.rawdata to be processed by a subsequent
        call.  If *end* is true, whatever could not be parsed is passed to
        handle_data() so that nothing is left over.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start(0)
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k       # parse_starttag returns an absolute index
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k       # absolute index
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = i+k     # parse_comment returns a length
                    continue
                if piopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k     # parse_pi returns a length
                    continue
                match = special.match(rawdata, i)
                if match:
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k       # absolute index
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminator character; give
                    # it back unless it was the ';' that closes the ref.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                raise SGMLParseError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    def parse_comment(self, i):
        """Internal -- parse the comment starting at index *i*.

        Calls handle_comment() with the text between '<!--' and the
        closing '--' and returns the LENGTH of the whole comment, or -1
        if the comment is not terminated in the buffered data.  Raises
        SGMLParseError if rawdata[i:] does not start with '<!--'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise SGMLParseError('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        j = match.start(0)
        self.handle_comment(rawdata[i+4: j])
        j = match.end(0)
        return j-i

    def parse_declaration(self, i):
        """Internal -- parse the declaration starting at index *i*.

        This is some sort of declaration; in "HTML as deployed," this
        should only be the document type declaration
        ("<!DOCTYPE html...>").  Calls handle_decl() with the text between
        '<!' and '>' and returns the INDEX just past the '>', or -1 if
        more data is needed.  Raises SGMLParseError if rawdata[i:] does
        not start with '<!' or if the declaration contains a character
        that is neither a name, a quoted literal nor '>'.
        """
        rawdata = self.rawdata
        j = i + 2
        # Raise instead of assert so the check survives -O and matches
        # what parse_comment()/parse_pi() do on misuse.
        if rawdata[i:j] != "<!":
            raise SGMLParseError("unexpected call to parse_declaration")
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            else:
                raise SGMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]))
        # end of buffer between tokens
        return -1

    def parse_pi(self, i):
        """Internal -- parse the processing instruction starting at *i*.

        Calls handle_pi() with the text between '<?' and the next '>' and
        returns the LENGTH of the whole instruction, or -1 if it is not
        terminated in the buffered data.  Raises SGMLParseError if
        rawdata[i:] does not start with '<?'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            raise SGMLParseError('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    # Source text of the start tag most recently handled by
    # parse_starttag(); None until one has been parsed.
    __starttag_text = None

    def get_starttag_text(self):
        """Return the text recorded by the latest parse_starttag() call."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- parse the start tag beginning at index *i*.

        Returns the INDEX just past the tag (not a length), or -1 if the
        tag is not terminated in the buffered data.  The tag name and the
        attribute names are lowercased; an attribute without a value gets
        its own name as value, and matching surrounding quotes are
        stripped from values.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                raise SGMLParseError('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume a closing '>'; a '<' that ended the tag is left in place.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def parse_endtag(self, i):
        """Internal -- parse the end tag beginning at index *i*.

        Returns the INDEX just past the tag, or -1 if no '<' or '>'
        follows in the buffered data.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- dispatch a start tag to its handler.

        Returns -1 for an unknown tag (unknown_starttag() was called),
        0 for an open-only tag handled by do_<tag>, and 1 for a balanced
        tag handled by start_<tag> (which is also pushed on self.stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- close *tag* and every tag opened after it.

        An empty *tag* closes only the innermost open tag.  A tag that is
        not on the stack is reported through report_unbalanced() when an
        end_<tag> handler exists and through unknown_endtag() otherwise.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                # Only the existence of a handler matters here.
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag.

        Writes the tag and the current stack to sys.stdout, but only when
        self.verbose is true.
        """
        if self.verbose:
            # write() instead of the print statement so the module also
            # compiles on Python 3; the emitted text is unchanged.
            import sys
            sys.stdout.write('*** Unbalanced </' + tag + '>\n')
            sys.stdout.write('*** Stack: %s\n' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        *name* is the decimal digit string of the reference.  Values in
        0..255 are passed to handle_data() as a single character; anything
        else goes to unknown_charref().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        Names missing from the mapping go to unknown_entityref().
        """
        try:
            replacement = self.entitydefs[name]
        except KeyError:
            self.unknown_entityref(name)
            return
        self.handle_data(replacement)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000461
462
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that writes every parse event to sys.stdout.

    Character data is accumulated in self.testdata and written out once
    its repr() reaches 70 characters, or before any other event is
    reported.
    """

    def __init__(self, verbose=0):
        # testdata must exist before the base __init__ runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def _emit(self, text):
        """Write *text* plus a newline to sys.stdout.

        Stands in for the print statement so the class also compiles on
        Python 3.
        """
        import sys
        sys.stdout.write(text + '\n')

    def handle_data(self, data):
        """Buffer *data*; flush once the buffer's repr() is 70+ chars."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Write the buffered data, if any, and clear the buffer."""
        data = self.testdata
        if data:
            self.testdata = ""
            self._emit('data: ' + repr(data))

    def handle_comment(self, data):
        """Report a comment, eliding the middle when its repr is long."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        self._emit('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Report a start tag together with its attributes."""
        self.flush()
        if not attrs:
            self._emit('start tag: <' + tag + '>')
        else:
            # Space-separated pieces, as the original trailing-comma
            # print statements produced.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            self._emit(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Report an end tag."""
        self.flush()
        self._emit('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Report an entity reference missing from entitydefs."""
        self.flush()
        self._emit('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Report a character reference that handle_charref() rejected."""
        self.flush()
        self._emit('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then write any data still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000512
513
def test(args = None):
    """Command-line driver: parse one file and report what was found.

    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the file
    to read ('-' means standard input; the default is 'test.html').  The
    text is fed to the parser one character at a time.  If the file
    cannot be opened, the error is written to standard output and the
    process exits with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() rather than "except IOError, msg" /
            # "except IOError as msg": valid on every Python version.
            msg = sys.exc_info()[1]
            sys.stdout.write('%s : %s\n' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even when read() fails; leave stdin open.
        if f is not sys.stdin:
            f.close()

    x = klass()
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()