blob: 104b25f2a07b0e716c40328d1878f56e774d8169 [file] [log] [blame]
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
# Warn on import that this module has no Python 3.0 counterpart, then drop
# the helper so it does not linger in the module namespace.
from warnings import warnpy3k
warnpy3k(
    "the sgmllib module has been removed in Python 3.0",
    stacklevel=2,
)
del warnpy3k
16
Fred Drakea3bae332001-09-24 20:15:51 +000017import markupbase
Guido van Rossum1fef1811997-10-23 19:09:21 +000018import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000019
# Names exported by "from sgmllib import *".
__all__ = [
    "SGMLParser",
    "SGMLParseError",
]
Guido van Rossum7c750e11995-02-27 13:16:55 +000021
22# Regular expressions used for parsing
23
# Next character that needs special treatment in running text.
interesting = re.compile(r'[&<]')

# A reference or tag that has started but may be cut off by the end of
# the data received so far.
incomplete = re.compile(
    r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
    r'<([a-zA-Z][^<>]*|'
    r'/([a-zA-Z][^<>]*)?|'
    r'![^<>]*)?'
)

# Entity / character references in text; the final character class
# consumes the terminator (';' or whatever follows the name).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start tags, including the SGML short forms "<>" and "<tag/data/".
starttagopen = re.compile(r'<[>a-zA-Z]')
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')

# Delimiters used while scanning inside markup.
piclose = re.compile(r'>')
endbracket = re.compile(r'[<>]')
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')

# One attribute: group 1 is the name, group 2 the optional "= value"
# part, group 3 the value itself (still quoted if it was quoted).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000042
Fred Drake66957372001-03-16 20:04:57 +000043
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
47
Guido van Rossum7c750e11995-02-27 13:16:55 +000048
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself
# (do_foo is only looked up when start_foo is not defined).
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.

59
class SGMLParser(markupbase.ParserBase):
    """SGML parser base class -- find tags and call handler methods.

    Usage: p = SGMLParser(); p.feed(data); ...; p.close().

    A start tag <foo> is dispatched to start_foo(attrs) (and the tag is
    pushed on self.stack) or, when that method does not exist, to
    do_foo(attrs).  An end tag </foo> is dispatched to end_foo().  Tags
    without a matching method go to unknown_starttag() and
    unknown_endtag().  Tag and attribute names are lower-cased first.
    Text between tags is passed to handle_data(), possibly split into
    arbitrary chunks.

    parse_comment() and parse_declaration() are called by goahead() but
    are not defined in this class; presumably they are inherited from
    markupbase.ParserBase.
    """

    # Matches one entity or character reference inside an attribute
    # value -- derived classes may override.  Groups: entity name,
    # decimal character number, optional terminating ';'.
    entity_or_charref = re.compile('&(?:'
                                   '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
                                   ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        A true 'verbose' makes report_unbalanced() print diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''       # text fed but not yet processed
        self.stack = []         # open tags that have a start_<tag> method
        self.lasttag = '???'    # tag name reused by the "<>" shorthand
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError carrying 'message'."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker: whatever could not be parsed is passed to
        handle_data() verbatim.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # Any end tag terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    # parse_pi() returns a length, not an index.
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern swallowed one terminator character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner (not referenced in this class;
    # presumably read by markupbase.ParserBase):
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse the processing instruction at index i.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the LENGTH of the instruction including delimiters, or
        -1 if it is not terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the source text of the most recent start tag, or None."""
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index i.

        Returns the INDEX just past the tag (unlike parse_pi(), which
        returns a length), or -1 if the tag is not terminated yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # XXX The following should skip matching quotes (' or ")
        # As a shortcut way to exit, this isn't so bad, but shouldn't
        # be used to locate the actual end of the start tag since the
        # < or > characters may be embedded in an attribute value.
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        if rawdata[i:i+2] == '<>':
            # SGML shorthand: <> == <last open tag seen>
            k = j
            tag = self.lasttag
        else:
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Bare attribute: its value is its own name.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        if rawdata[j] == '>':
            # Consume the closing '>'; a '<' is left for the next tag.
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert one entity or character reference.

        'match' is a match of entity_or_charref.  Returns the converted
        text, or the reference unchanged when the converter returns
        None.  Entity references without a trailing ';' are never
        converted.
        """
        name, number, semicolon = match.groups()
        if number:
            replacement = self.convert_charref(number)
            # Test for None, like handle_charref(), so that an empty
            # replacement string is honoured instead of discarded.
            if replacement is None:
                return '&#%s%s' % (number, semicolon)
            return replacement
        if semicolon:
            replacement = self.convert_entityref(name)
            if replacement is None:
                return '&%s;' % name
            return replacement
        return '&%s' % name

    def parse_endtag(self, i):
        """Internal -- handle the end tag beginning at index i.

        Returns the index just past the tag, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        match = endbracket.search(rawdata, i+1)
        if not match:
            return -1
        j = match.start(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Returns -1 for an unknown tag, 0 for an open-only tag (handled
        by do_<tag>), 1 for a balanced tag (handled by start_<tag> and
        pushed on self.stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag.

        An empty 'tag' closes the innermost open tag.  Otherwise every
        tag opened since the last occurrence of 'tag' on the stack is
        closed, innermost first.  A tag that is not on the stack is
        reported through report_unbalanced() when an end_<tag> method
        exists, and through unknown_endtag() otherwise.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            found = len(self.stack)
            # Keep the LAST matching index: the innermost open element.
            for index, open_tag in enumerate(self.stack):
                if open_tag == tag:
                    found = index
        while len(self.stack) > found:
            tag = self.stack[-1]
            method = getattr(self, 'end_' + tag, None)
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag.

        Prints the tag and the current stack when self.verbose is true.
        """
        if self.verbose:
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        'name' is the decimal number as a string.  Returns None when it
        is not an integer or lies outside 0..127.
        """
        try:
            n = int(name)
        except ValueError:
            return
        if not 0 <= n <= 127:
            return
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for 'codepoint' using chr()."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for names missing from the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        else:
            return

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            self.handle_data(replacement)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs): pass
    def unknown_endtag(self, tag): pass
    def unknown_charref(self, ref): pass
    def unknown_entityref(self, ref): pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000459
460
class TestSGMLParser(SGMLParser):
    """Demonstration parser that prints every event it receives.

    Character data is accumulated in self.testdata and printed once its
    repr() reaches 70 characters or another kind of event arrives.
    """

    def __init__(self, verbose=0):
        # testdata must exist before the base initializer runs.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer 'data'; flush when the buffer's repr() gets long."""
        self.testdata += data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        pending = self.testdata
        if not pending:
            return
        self.testdata = ""
        print('data: ' + repr(pending))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of long ones."""
        self.flush()
        shown = repr(data)
        if len(shown) > 68:
            shown = shown[:32] + '...' + shown[-32:]
        print('comment: ' + shown)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
            return
        # Pieces are separated by single spaces, including the one
        # before the closing '>'.
        pieces = ['start tag: <' + tag]
        pieces.extend(name + '=' + '"' + value + '"'
                      for name, value in attrs)
        pieces.append('>')
        print(' '.join(pieces))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print a notice about an unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print a notice about an unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print a notice about an unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Finish parsing, then print any data still buffered."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000514
515
def test(args = None):
    """Parse a file one character at a time as a self-test.

    'args' defaults to sys.argv[1:].  A leading '-s' selects the silent
    SGMLParser instead of TestSGMLParser.  The next argument names the
    input file ('-' means standard input, default 'test.html').  If the
    file cannot be opened, a message is printed and sys.exit(1) is
    called.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        path = args[0]
    else:
        path = 'test.html'

    if path == '-':
        f = sys.stdin
    else:
        try:
            f = open(path, 'r')
        except IOError as msg:
            print('%s : %s' % (path, msg))
            sys.exit(1)

    # Close the file even when read() fails; never close stdin.
    try:
        data = f.read()
    finally:
        if f is not sys.stdin:
            f.close()

    x = klass()
    # Feeding single characters exercises the incomplete-input paths.
    for c in data:
        x.feed(c)
    x.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000550
551
# Run the self-test when this file is executed as a script.
if __name__ == "__main__":
    test()