blob: f228e5b2ff8d3fc041fa6b635d2e146abcb15948 [file] [log] [blame]
"""A parser for SGML, using the derived class as a static DTD."""
Guido van Rossum7c750e11995-02-27 13:16:55 +00002
# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Guido van Rossum1fef1811997-10-23 19:09:21 +000012import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000013import string
14
Skip Montanaro0de65802001-02-15 22:15:14 +000015__all__ = ["SGMLParser"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing

# Finds the next character that may start markup: '&' or '<'.
interesting = re.compile('[&<]')
# Matches a reference or tag that has been started but may still be
# cut off by the end of the buffered data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                        '/([a-zA-Z][^<>]*)?|'
                        '![^<>]*)?')

# Both reference patterns also consume the single character that
# terminates the reference (';' or anything else).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')

starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Raw string: '\?' is not a valid escape in an ordinary string literal.
piopen = re.compile(r'<\?')
piclose = re.compile('>')
endtagopen = re.compile('</[<>a-zA-Z]')
endbracket = re.compile('[<>]')
special = re.compile('<![^<>]*>')
commentopen = re.compile('<!--')
commentclose = re.compile(r'--\s*>')
declopen = re.compile('<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
# Quoted string literal inside a declaration, plus trailing whitespace.
# (Defined once; the original compiled this identical pattern twice.)
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# group(1): attribute name; group(2): the optional "= value" part;
# group(3): the value itself, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')

decldata = re.compile(r'[^>\'\"]+')
48
49
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
53
Guido van Rossum7c750e11995-02-27 13:16:55 +000054
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
65
class SGMLParser:
    """Incremental SGML parser that dispatches to handler methods.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    Tag names are lower-cased and looked up as methods on the
    instance: start_foo()/end_foo() handle <foo> and </foo>, and
    do_foo() handles a <foo> that is not tracked on the open-tag stack.
    Tags without such methods go to unknown_starttag() and
    unknown_endtag().  Text is delivered through handle_data(), possibly
    in several chunks.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''       # fed text that has not been consumed yet
        self.stack = []         # open tags that have a start_xxx method
        self.lasttag = '???'    # tag name used for the '<>' shorthand
        self.nomoretags = 0     # true: everything left is plain data
        self.literal = 0        # true: only an end tag is treated as markup

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Extra arguments are
        accepted and ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker: whatever could not be parsed is passed to
        handle_data().
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything up to the next '&' or '<' is plain data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start(0)
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    # End tags are parsed even in literal mode, and
                    # parsing one switches literal mode off.
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_comment() returns a length, not an index.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if piopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                match = special.match(rawdata, i)
                if match:
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                raise SGMLParseError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep only the unconsumed tail for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    def parse_comment(self, i):
        """Internal -- parse comment, return length or -1 if not terminated.

        Raises SGMLParseError if the buffer does not hold '<!--' at
        index i.  The comment text is passed to handle_comment().
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise SGMLParseError('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        j = match.start(0)
        self.handle_comment(rawdata[i+4: j])
        j = match.end(0)
        return j-i

    def parse_declaration(self, i):
        """Internal -- parse declaration.

        Returns the index just past the closing '>', or -1 if the
        declaration cannot be completed from the buffered data.  The
        text between '<!' and '>' is passed to handle_decl().  Raises
        SGMLParseError on an unexpected character, or if the buffer does
        not hold '<!' at index i.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        # Raise rather than assert: asserts vanish under -O, and the
        # sibling parse_*() methods report misuse the same way.
        if rawdata[i:j] != "<!":
            raise SGMLParseError("unexpected call to parse_declaration")
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            else:
                raise SGMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]))
        # end of buffer between tokens
        return -1

    def parse_pi(self, i):
        """Internal -- parse processing instr, return length or -1.

        -1 means the instruction is not terminated yet.  Raises
        SGMLParseError if the buffer does not hold '<?' at index i.
        The text between '<?' and '>' is passed to handle_pi().
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            raise SGMLParseError('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    # Source text of the start tag seen most recently.
    __starttag_text = None

    def get_starttag_text(self):
        """Return the source text of the most recent start tag, or None."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle starttag.

        Returns the index just past the tag (not a length), or -1 if
        the tag is not terminated within the buffered data.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            # Set before the handlers run so they can call
            # get_starttag_text().
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                raise SGMLParseError('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without "= value": the name is the value.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                    attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip one pair of matching quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # The tag may have been ended by '<' instead; leave that
        # character in place for the next round.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def parse_endtag(self, i):
        """Internal -- parse endtag.

        Returns the index just past the tag (not a length), or -1 if
        the tag is not terminated within the buffered data.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of start tag.

        Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag.
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            # Only tags with a start_xxx method go on the stack.
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of end tag.

        An empty tag name closes the innermost open tag.  Otherwise
        every tag opened after the innermost matching one is closed as
        well, innermost first.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    # Only the existence of the handler matters here.
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Find the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        method()

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    # Example -- handle data, should be overridden
    def handle_data(self, data):
        pass

    # Example -- handle comment, could be overridden
    def handle_comment(self, data):
        pass

    # Example -- handle declaration, could be overridden
    def handle_decl(self, decl):
        pass

    # Example -- handle processing instruction, could be overridden
    def handle_pi(self, data):
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000469
470
class TestSGMLParser(SGMLParser):
    """Parser that prints every event it sees; used by test()."""

    def __init__(self, verbose=0):
        # Must exist before SGMLParser.__init__() runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data and print it once its repr reaches 70 characters."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear any buffered data."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of a long repr."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag and its attributes on one line."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Pieces are joined with single spaces, including the one
            # before the closing '>'.
            parts = ['start tag: <' + tag]
            for name, value in attrs:
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000520
521
def test(args = None):
    """Parse a file and report what the parser sees.

    args is a list of command-line style arguments; when empty or
    None, sys.argv[1:] is used.  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument is the
    file name ('test.html' when missing); '-' means standard input.
    Exits with status 1 if the file cannot be opened.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() avoids version-specific "except" syntax.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    try:
        data = f.read()
    finally:
        # Close the file even if reading fails; never close stdin.
        if f is not sys.stdin:
            f.close()

    # Feed one character at a time.
    x = klass()
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()