blob: 3ab57c23071707b0a23c29e58f35afaa09306f37 [file] [log] [blame]
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Fred Drakea3bae332001-09-24 20:15:51 +000012import markupbase
Guido van Rossum1fef1811997-10-23 19:09:21 +000013import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
# Names exported by ``from sgmllib import *``.
__all__ = [
    "SGMLParser",
    "SGMLParseError",
]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing

# Next character that may start markup (a reference or a tag).
interesting = re.compile(r'[&<]')
# Prefix of a reference, tag, end tag or declaration that may still be
# incomplete because more input is pending.
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?'
)

# Named entity reference and numeric character reference; each pattern
# also consumes the single character that terminates the reference.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start of a start tag: '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile(r'<[>a-zA-Z]')
# SGML short tag: <tag/data/
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile(r'>')
# Either bracket ends the scan for the end of a tag.
endbracket = re.compile(r'[<>]')
# Tag name.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: a name, optionally followed by '=' and a quoted or
# unquoted value.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000037
Fred Drake66957372001-03-16 20:04:57 +000038
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors.

    Derives from RuntimeError, so handlers written for RuntimeError
    also catch it.
    """
42
Guido van Rossum7c750e11995-02-27 13:16:55 +000043
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
54
class SGMLParser(markupbase.ParserBase):
    """Event-driven SGML parser; the derived class acts as the DTD.

    Feed text with feed() and finish with close().  For every tag the
    parser looks up a method on itself named start_<tag>, do_<tag> or
    end_<tag> (tag names are lower-cased first) and calls it; tags
    without such a method go to the unknown_*() hooks.  Text between
    tags is delivered through handle_data(), possibly in several
    chunks.
    """

    # Matches '&name' or '&#123' with an optional trailing ';'.  Used to
    # expand references inside attribute values.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # input not yet consumed by goahead()
        self.stack = []         # open tags that have a start_<tag> method
        self.lasttag = '???'    # most recent start tag, used for '<>'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Intended for derived classes only.  Extra positional arguments
        are accepted and ignored.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker.  Whatever could not be consumed is stored back in
        self.rawdata.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # Start tags are not markup in literal mode.
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag leaves literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    # parse_comment() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    # parse_declaration() is not defined in this class;
                    # presumably inherited from markupbase.ParserBase.
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction starting at i.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the length of the whole instruction, or -1 if it is
        not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text of the most recently parsed start tag.

        The value is None after reset() and while a new start tag is
        being parsed.
        """
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index i.

        Returns the index just past the tag, or -1 if the tag is not
        terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without '=value': the name is the value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                    attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # Consume a closing '>'; a '<' that ended the scan is left for
        # the next round of goahead().
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert entity or character reference.

        'match' is a match of entity_or_charref.  Returns the
        replacement text, or the reference as written when it cannot
        be converted.  Entity references lacking the ';' are never
        converted.  A converter signals "unknown" by returning None,
        so an empty string counts as a valid replacement.
        """
        entity_name, char_code, terminator = match.groups()
        if char_code:
            replacement = self.convert_charref(char_code)
            if replacement is None:
                return '&#%s%s' % (char_code, terminator)
            return replacement
        if terminator:
            replacement = self.convert_entityref(entity_name)
            if replacement is None:
                return '&%s;' % entity_name
            return replacement
        return '&%s' % entity_name

    def parse_endtag(self, i):
        """Internal -- parse the end tag beginning at index i.

        Returns the index just past the tag, or -1 if the tag is not
        terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of start tag.

        Returns -1 for unknown tag, 0 for open-only tag (do_<tag>),
        1 for balanced tag (start_<tag>; the tag is pushed on the
        stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of end tag.

        An empty tag name (</>) closes the innermost open tag.  A named
        tag closes its last occurrence on the stack together with
        everything opened after it.  A tag that is not open is reported
        through report_unbalanced() when an end_<tag> method exists,
        otherwise through unknown_endtag().
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the last (innermost) occurrence of the tag.
            found = len(self.stack)
            for i in range(found):
                if self.stack[i] == tag:
                    found = i
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag.

        Prints the tag and the current stack when self.verbose is true.
        """
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        Returns None when 'name' is not a decimal number in the range
        0..255, otherwise the result of convert_codepoint().
        """
        try:
            n = int(name)
        except ValueError:
            return None
        if not 0 <= n <= 255:
            return None
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for the given code point."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
            {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for names missing from the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            # Reuse the value already computed instead of converting twice.
            self.handle_data(replacement)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000454
455
class TestSGMLParser(SGMLParser):
    """Demonstration parser that prints every event it receives.

    Character data is accumulated in self.testdata and printed by
    flush(), which runs before any other event is reported and once
    the buffered text's repr reaches 70 characters.
    """

    def __init__(self, verbose=0):
        # The buffer is created before the base initializer runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer the data; flush once the buffer's repr is long enough."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered data, if there is any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data: ' + repr(pending))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            shown = shown[:32] + '...' + shown[-32:]
        print('comment: ' + shown)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # One space-separated line: tag, every attribute, closing '>'.
        pieces = ['start tag: <' + tag]
        for name, value in attrs:
            pieces.append(name + '=' + '"' + value + '"')
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a notice about an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a notice about an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print a notice about an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print whatever data is still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000509
510
def test(args = None):
    """Parse a file and report the events on standard output.

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
    selects the silent SGMLParser instead of TestSGMLParser.  The next
    argument names the input file ('test.html' when omitted); '-'
    reads standard input.

    Prints a message and exits with status 1 when the file cannot be
    opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(path, 'r')
        except IOError:
            # sys.exc_info() keeps this valid on every Python version.
            msg = sys.exc_info()[1]
            print("%s : %s" % (path, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # Close the file even when reading fails.
            f.close()

    x = klass()
    # Feed one character at a time so that the parser's handling of
    # incomplete input is exercised.
    for c in data:
        x.feed(c)
    x.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000545
546
# Run the demonstration parser when executed as a script.
if __name__ == '__main__':
    test()