blob: f2a302095b633eef745349a447149ad2b461b1fb [file] [log] [blame]
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Guido van Rossum1fef1811997-10-23 19:09:21 +000012import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000013
# Public names of this module.  SGMLParseError is exported as well because
# it is the exception that escapes from SGMLParser.feed()/close() on a
# parse error, so callers need the name in order to catch it.
__all__ = ["SGMLParser", "SGMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000015
# Regular expressions used for parsing.
#
# Every pattern is written as a raw string so that backslashes reach the
# regex engine untouched (a non-raw '<\?' is an invalid string escape).

# Next character that may start markup: '&' (reference) or '<' (tag).
interesting = re.compile(r'[&<]')
# Prefix of a reference or tag that more input could still complete; used
# by goahead() when no complete construct matches at the current position.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# '&name' / '&#digits' followed by one terminating character; group 1 is
# the entity name or the decimal digits.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the SGML shorthand '<>'.
starttagopen = re.compile(r'<[>a-zA-Z]')
# SGML shorthand '<tag/data/': opening part, and the complete construct
# (group 1 is the tag name, group 2 the data).
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Processing instruction delimiters.
piopen = re.compile(r'<\?')
piclose = re.compile(r'>')
# '</' followed by a letter, '<' or '>'.
endtagopen = re.compile(r'</[<>a-zA-Z]')
# First '<' or '>' after a tag opener; marks where the tag stops.
endbracket = re.compile(r'[<>]')
# A complete '<!...>' markup declaration containing no nested brackets.
special = re.compile(r'<![^<>]*>')
# Comment delimiters; whitespace is allowed between '--' and '>'.
commentopen = re.compile(r'<!--')
commentclose = re.compile(r'--\s*>')
# Pieces of a markup declaration; declname and declstringlit also consume
# any whitespace that follows the token.
declopen = re.compile(r'<!')
declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
# Tag name at the start of a start tag.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value (still carrying its quotes when it was quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~\'"]*))?')

# Run of declaration text containing neither '>' nor a quote character.
decldata = re.compile(r'[^>\'\"]+')
47
48
class SGMLParseError(RuntimeError):
    """Raised by the parser for every kind of parse error.

    Derives from RuntimeError and adds no behaviour of its own.
    """
52
Guido van Rossum7c750e11995-02-27 13:16:55 +000053
class SGMLParser:
    """SGML parser base class -- find tags and call handler functions.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    The DTD is defined by deriving a class which defines methods with
    special names to handle tags: start_foo and end_foo to handle <foo>
    and </foo>, respectively, or do_foo to handle <foo> by itself.
    (Tags are converted to lower case for this purpose.)  The data
    between tags is passed to the parser by calling self.handle_data()
    with some data as argument (the data may be split up in arbitrary
    chunks).  Entity references are passed by calling
    self.handle_entityref() with the entity reference as argument.
    """

    def __init__(self, verbose=0):
        """Initialize and reset this instance."""
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''       # text fed so far but not yet consumed
        self.stack = []         # open tags that have a start_xxx handler
        self.lasttag = '???'    # last start tag seen; used for '<>'
        self.nomoretags = 0     # true: the rest of the input is plain data
        self.literal = 0        # true: only an end tag is treated as markup

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Any arguments are ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker.  Whatever could not be consumed stays in
        self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Everything up to the next '&' or '<' is plain data.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start(0)
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_comment() returns a length, not an index.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if piopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # parse_pi() returns a length, not an index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                match = special.match(rawdata, i)
                if match:
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallows one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                raise SGMLParseError('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    def parse_comment(self, i):
        """Internal -- parse the comment starting at index i.

        Calls handle_comment() with the comment text and returns the
        length of the whole comment, or -1 if it is not terminated yet.
        Raises SGMLParseError if rawdata[i:] does not start with '<!--'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+4] != '<!--':
            raise SGMLParseError('unexpected call to parse_comment()')
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        j = match.start(0)
        self.handle_comment(rawdata[i+4: j])
        j = match.end(0)
        return j-i

    def parse_declaration(self, i):
        """Internal -- parse the markup declaration starting at index i.

        This is some sort of declaration; in "HTML as deployed," this
        should only be the document type declaration
        ("<!DOCTYPE html...>").

        Calls handle_decl() with the text between '<!' and '>' and
        returns the index just past the '>', or -1 if the declaration is
        incomplete.  Raises SGMLParseError if rawdata[i:] does not start
        with '<!' or if an unexpected character is found.
        """
        rawdata = self.rawdata
        j = i + 2
        # Raise rather than assert: the check survives 'python -O' and
        # matches what parse_comment() and parse_pi() do.
        if rawdata[i:j] != "<!":
            raise SGMLParseError('unexpected call to parse_declaration()')
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                self.handle_decl(rawdata[i+2:j])
                return j + 1
            if c in "\"'":
                m = declstringlit.match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                m = declname.match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            else:
                raise SGMLParseError(
                    "unexpected char in declaration: %s" % repr(rawdata[j]))
        # end of buffer between tokens
        return -1

    def parse_pi(self, i):
        """Internal -- parse the processing instruction starting at index i.

        Calls handle_pi() with the text between '<?' and '>' and returns
        the length of the whole instruction, or -1 if it is not
        terminated yet.  Raises SGMLParseError if rawdata[i:] does not
        start with '<?'.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            raise SGMLParseError('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    # Source text of the start tag most recently parsed, or None.
    __starttag_text = None

    def get_starttag_text(self):
        """Return the text recorded by the last parse_starttag() call."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index i.

        Returns the index at which parsing should continue, or -1 if the
        tag is not terminated yet.  A terminating '>' is consumed; a
        terminating '<' is left in place for the next round.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            # Set before the handlers run so that they can already call
            # get_starttag_text().
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                raise SGMLParseError('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without '=': its value is its own name.
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of quotes.
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def parse_endtag(self, i):
        """Internal -- handle the end tag beginning at index i.

        Returns the index at which parsing should continue, or -1 if the
        tag is not terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of start tag.

        Return -1 for unknown tag, 0 for open-only tag (do_xxx handler),
        1 for balanced tag (start_xxx handler; the tag is pushed on the
        stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of end tag.

        An empty tag ('</>') closes the innermost open tag.  A named tag
        closes every tag opened since its innermost occurrence on the
        stack, calling end_xxx (or unknown_endtag) for each.  A named
        tag that is not on the stack is reported through
        report_unbalanced() when an end_xxx handler exists, otherwise
        through unknown_endtag().
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Index of the innermost (last) occurrence of the tag; the
            # membership test above guarantees that one exists.
            found = len(self.stack) - 1
            while self.stack[found] != tag:
                found = found - 1
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag (only if verbose)."""
        if self.verbose:
            # Single-argument print(...) produces the same output whether
            # print is a statement or a function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def handle_charref(self, name):
        """Handle character reference, no need to override.

        A decimal reference in the range 0..255 is converted with chr()
        and passed to handle_data(); anything else goes to
        unknown_charref().
        """
        try:
            n = int(name)
        except ValueError:
            self.unknown_charref(name)
            return
        if not 0 <= n <= 255:
            self.unknown_charref(name)
            return
        self.handle_data(chr(n))

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def handle_entityref(self, name):
        """Handle entity references.

        There should be no need to override this method; it can be
        tailored by setting up the self.entitydefs mapping appropriately.
        """
        table = self.entitydefs
        if name in table:
            self.handle_data(table[name])
        else:
            self.unknown_entityref(name)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000468
469
class TestSGMLParser(SGMLParser):
    """Parser that prints every event it sees; used by test().

    Character data is collected in self.testdata and printed in chunks
    so that data arriving in small pieces is not printed piece by piece.

    Every line is emitted with a single-argument print(...), which
    produces the same output whether print is a statement or a function.
    """

    def __init__(self, verbose=0):
        # Must exist before SGMLParser.__init__() runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer data; flush once its repr() is at least 70 chars long."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered data, if there is any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of a long one."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # Space-separated pieces, closing '>' included.
            pieces = ['start tag: <' + tag]
            for name, value in attrs:
                pieces.append(name + '=' + '"' + value + '"')
            pieces.append('>')
            print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a notice about an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a notice about an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000519
520
def test(args = None):
    """Run a parser over a file, feeding it one character at a time.

    args is a list of command line arguments, [-s] [file]; when it is
    empty or None, sys.argv[1:] is used instead.  With '-s' the silent
    base class SGMLParser is used, otherwise TestSGMLParser, which prints
    what it sees.  The file defaults to 'test.html'; '-' means standard
    input.  If the file cannot be opened, the error is printed and the
    process exits with status 1.
    """
    import sys

    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    # Named 'filename' so that the builtin 'file' is not shadowed.
    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        f = sys.stdin
    else:
        try:
            f = open(filename, 'r')
        except IOError:
            # sys.exc_info() works on every Python version, unlike
            # either spelling of the 'except' clause with a target.
            msg = sys.exc_info()[1]
            print('%s : %s' % (filename, msg))
            sys.exit(1)

    # Close the file even when read() fails; never close stdin.
    try:
        data = f.read()
    finally:
        if f is not sys.stdin:
            f.close()

    x = klass()
    # One character per feed() call exercises the handling of
    # incomplete constructs.
    for c in data:
        x.feed(c)
    x.close()


if __name__ == '__main__':
    test()