"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
Guido van Rossum7c750e11995-02-27 13:16:55 +000010
11
Fred Drakea3bae332001-09-24 20:15:51 +000012import markupbase
Guido van Rossum1fef1811997-10-23 19:09:21 +000013import re
Guido van Rossum7c750e11995-02-27 13:16:55 +000014
Fred Drake58ae8302004-09-09 01:49:58 +000015__all__ = ["SGMLParser", "SGMLParseError"]
Guido van Rossum7c750e11995-02-27 13:16:55 +000016
# Regular expressions used for parsing

# The next character that needs special treatment: start of a reference
# ('&') or of markup ('<').
interesting = re.compile(r'[&<]')

# A (possibly partial) reference or tag; used to decide whether the text
# at the current position is merely incomplete or plain data.
incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        r'<([a-zA-Z][^<>]*|'
                        r'/([a-zA-Z][^<>]*)?|'
                        r'![^<>]*)?')

# Named entity reference and decimal character reference.  Both consume
# one trailing character (';' or whatever terminates the name/number).
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#([0-9]+)[^0-9]')

# Start of a start tag: '<' followed by a letter, or the '<>' shorthand.
starttagopen = re.compile(r'<[>a-zA-Z]')
# SGML short tag <tag/data/: the opening part, then the complete form.
shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# Terminator of a processing instruction.
piclose = re.compile(r'>')
# A whole start tag including its attributes, up to (but excluding) the
# closing '<' or '>'.  Quoted attribute values may contain '>'.
starttag = re.compile(r'<[a-zA-Z][-_.:a-zA-Z0-9]*\s*('
        r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
        r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~@]'
        r'[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*(?=[\s>/<])))?'
    r')*\s*/?\s*(?=[<>])')
# A whole end tag, up to (but excluding) the closing '<' or '>'.
endtag = re.compile(r'</?[a-zA-Z][-_.:a-zA-Z0-9]*\s*/?\s*(?=[<>])')
# The tag name at the start of a tag.
tagfind = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: name (group 1), optional '= value' part (group 2) and
# the raw, possibly quoted, value (group 3).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
Guido van Rossum7c750e11995-02-27 13:16:55 +000042
Fred Drake66957372001-03-16 20:04:57 +000043
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
47
Guido van Rossum7c750e11995-02-27 13:16:55 +000048
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.)  The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks).  Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
59
class SGMLParser(markupbase.ParserBase):
    """Event-driven SGML parser; the derived class acts as a static DTD.

    Usage: ``p = SGMLParser(); p.feed(data); ...; p.close()``.

    A subclass handles tags by defining specially named methods:
    ``start_foo``/``end_foo`` for ``<foo>``/``</foo>``, or ``do_foo``
    for a ``<foo>`` that has no matching end tag.  Tag names are lower
    cased before the lookup.  Text between tags is reported through
    handle_data(), possibly split into arbitrary chunks.

    parse_comment() and parse_declaration() are not defined in this
    class; they are presumably inherited from markupbase.ParserBase.
    """

    # Matches an entity reference (group 1) or a decimal character
    # reference (group 2) inside an attribute value; group 3 is the
    # optional terminating ';'.  Derived classes may override.
    entity_or_charref = re.compile('&(?:'
      '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
      ')(;?)')

    def __init__(self, verbose=0):
        """Initialize and reset this instance.

        verbose -- when true, report_unbalanced() prints diagnostics.
        """
        self.verbose = verbose
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.__starttag_text = None
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.nomoretags = 0
        self.literal = 0
        markupbase.ParserBase.reset(self)

    def setnomoretags(self):
        """Enter literal mode (CDATA) till EOF.

        Intended for derived classes only.
        """
        self.nomoretags = self.literal = 1

    def setliteral(self, *args):
        """Enter literal mode (CDATA).

        Any positional arguments are accepted and ignored.
        Intended for derived classes only.
        """
        self.literal = 1

    def feed(self, data):
        """Feed some data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).  (This just saves the text,
        all the processing is done by goahead().)
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle the remaining data."""
        self.goahead(1)

    def error(self, message):
        """Raise SGMLParseError with the given message."""
        raise SGMLParseError(message)

    def goahead(self, end):
        """Internal -- handle data as far as reasonable.

        May leave state and data to be processed by a subsequent call.
        If 'end' is true, force handling all data as if followed by an
        EOF marker.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.nomoretags:
                # Everything up to EOF is plain data.
                self.handle_data(rawdata[i:n])
                i = n
                break
            # Emit the plain text in front of the next '&' or '<'.
            match = interesting.search(rawdata, i)
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = j
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        self.handle_data(rawdata[i])
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("</", i):
                    k = self.parse_endtag(i)
                    if k < 0:
                        break
                    i = k
                    # An end tag always terminates literal mode.
                    self.literal = 0
                    continue
                if self.literal:
                    if n > (i + 1):
                        self.handle_data("<")
                        i = i+1
                    else:
                        # incomplete
                        break
                    continue
                if rawdata.startswith("<!--", i):
                    # Strictly speaking, a comment is --.*--
                    # within a declaration tag <!...>.
                    # This should be removed,
                    # and comments handled only in parse_declaration.
                    k = self.parse_comment(i)
                    if k < 0:
                        break
                    i = k
                    continue
                if rawdata.startswith("<?", i):
                    # parse_pi() returns a length, not an end index.
                    k = self.parse_pi(i)
                    if k < 0:
                        break
                    i = i+k
                    continue
                if rawdata.startswith("<!", i):
                    # This is some sort of declaration; in "HTML as
                    # deployed," this should only be the document type
                    # declaration ("<!DOCTYPE html...>").
                    k = self.parse_declaration(i)
                    if k < 0:
                        break
                    i = k
                    continue
            elif rawdata[i] == '&':
                if self.literal:
                    self.handle_data(rawdata[i])
                    i = i+1
                    continue
                match = charref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_charref(name)
                    i = match.end(0)
                    # The pattern consumed one terminating character;
                    # give it back unless it was the ';'.
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    i = match.end(0)
                    if rawdata[i-1] != ';':
                        i = i-1
                    continue
            else:
                self.error('neither < nor & ??')
            # We get here only if incomplete matches but
            # nothing else
            match = incomplete.match(rawdata, i)
            if not match:
                self.handle_data(rawdata[i])
                i = i+1
                continue
            j = match.end(0)
            if j == n:
                break  # Really incomplete
            self.handle_data(rawdata[i:j])
            i = j
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = n
        # Keep whatever could not be processed yet for the next call.
        self.rawdata = rawdata[i:]
        # XXX if end: check for empty stack

    # Extensions for the DOCTYPE scanner:
    _decl_otherchars = '='

    def parse_pi(self, i):
        """Internal -- parse a processing instruction starting at i.

        Calls handle_pi() with the text between '<?' and the next '>'.
        Returns the length of the instruction, or -1 if it is not
        terminated yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<?':
            self.error('unexpected call to parse_pi()')
        match = piclose.search(rawdata, i+2)
        if not match:
            return -1
        j = match.start(0)
        self.handle_pi(rawdata[i+2: j])
        j = match.end(0)
        return j-i

    def get_starttag_text(self):
        """Return the text recorded for the most recent start tag.

        None if no start tag has been parsed since the last reset().
        """
        return self.__starttag_text

    def parse_starttag(self, i):
        """Internal -- handle the start tag beginning at index i.

        Returns the index just past the tag, or -1 if the tag is not
        complete yet.
        """
        self.__starttag_text = None
        start_pos = i
        rawdata = self.rawdata
        if shorttagopen.match(rawdata, i):
            # SGML shorthand: <tag/data/ == <tag>data</tag>
            # XXX Can data contain &... (entity or char refs)?
            # XXX Can data contain < or > (tag characters)?
            # XXX Can there be whitespace before the first /?
            match = shorttag.match(rawdata, i)
            if not match:
                return -1
            tag, data = match.group(1, 2)
            self.__starttag_text = '<%s/' % tag
            tag = tag.lower()
            k = match.end(0)
            self.finish_shorttag(tag, data)
            self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
            return k
        # Now parse the tag into a name and attrs
        attrs = []
        if rawdata.startswith('<>', i):
            # SGML shorthand: <> == <last open tag seen>.  The starttag
            # pattern requires a tag name and can never match this form
            # (it would be mistaken for an incomplete tag and stall the
            # parser), so it is recognized before the pattern is tried.
            j = i + 1  # index of the closing '>'
            k = j
            tag = self.lasttag
        else:
            match = starttag.match(rawdata, i)
            if not match:
                return -1
            j = match.end(0)
            match = tagfind.match(rawdata, i+1)
            if not match:
                self.error('unexpected call to parse_starttag')
            k = match.end(0)
            tag = rawdata[i+1:k].lower()
            self.lasttag = tag
        while k < j:
            match = attrfind.match(rawdata, k)
            if not match:
                break
            attrname, rest, attrvalue = match.group(1, 2, 3)
            if not rest:
                # Attribute without a value: the name doubles as value.
                attrvalue = attrname
            else:
                if (attrvalue[:1] == "'" == attrvalue[-1:] or
                        attrvalue[:1] == '"' == attrvalue[-1:]):
                    # strip quotes
                    attrvalue = attrvalue[1:-1]
                attrvalue = self.entity_or_charref.sub(
                    self._convert_ref, attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = match.end(0)
        # The tag may also be ended by the '<' of the next tag, which
        # must not be consumed.
        if rawdata[j] == '>':
            j = j+1
        self.__starttag_text = rawdata[start_pos:j]
        self.finish_starttag(tag, attrs)
        return j

    def _convert_ref(self, match):
        """Internal -- convert an entity or character reference.

        'match' is a match of self.entity_or_charref.  Returns the
        replacement text, or the original reference text when no
        (non-empty) replacement is available.
        """
        if match.group(2):
            return self.convert_charref(match.group(2)) or \
                '&#%s%s' % match.groups()[1:]
        elif match.group(3):
            return self.convert_entityref(match.group(1)) or \
                '&%s;' % match.group(1)
        else:
            # Entity name without terminating ';': left untouched.
            return '&%s' % match.group(1)

    def parse_endtag(self, i):
        """Internal -- handle the end tag beginning at index i.

        Returns the index just past the tag, or -1 if the tag is not
        complete yet.
        """
        rawdata = self.rawdata
        if rawdata.startswith('</>', i):
            # SGML shorthand: </> closes the most recently opened
            # element.  The endtag pattern requires a tag name and can
            # never match this form, so it is handled explicitly.
            self.finish_endtag('')
            return i + 3
        match = endtag.match(rawdata, i)
        if not match:
            return -1
        j = match.end(0)
        tag = rawdata[i+2:j].strip().lower()
        if rawdata[j] == '>':
            j = j+1
        self.finish_endtag(tag)
        return j

    def finish_shorttag(self, tag, data):
        """Internal -- finish <tag/data/ (same as <tag>data</tag>)."""
        self.finish_starttag(tag, [])
        self.handle_data(data)
        self.finish_endtag(tag)

    def finish_starttag(self, tag, attrs):
        """Internal -- finish processing of a start tag.

        Returns -1 for an unknown tag, 0 for an open-only tag (handled
        by do_<tag>) and 1 for a balanced tag (handled by start_<tag>
        and pushed on self.stack).
        """
        try:
            method = getattr(self, 'start_' + tag)
        except AttributeError:
            try:
                method = getattr(self, 'do_' + tag)
            except AttributeError:
                self.unknown_starttag(tag, attrs)
                return -1
            else:
                self.handle_starttag(tag, method, attrs)
                return 0
        else:
            self.stack.append(tag)
            self.handle_starttag(tag, method, attrs)
            return 1

    def finish_endtag(self, tag):
        """Internal -- finish processing of an end tag.

        An empty tag name closes the innermost open element.  Otherwise
        every element from the top of the stack down to (and including)
        the innermost open 'tag' is closed.  A tag that is not open is
        reported through report_unbalanced() when an end_<tag> method
        exists and through unknown_endtag() otherwise.
        """
        if not tag:
            found = len(self.stack) - 1
            if found < 0:
                self.unknown_endtag(tag)
                return
        else:
            if tag not in self.stack:
                try:
                    method = getattr(self, 'end_' + tag)
                except AttributeError:
                    self.unknown_endtag(tag)
                else:
                    self.report_unbalanced(tag)
                return
            # Locate the innermost (last) occurrence of the tag.
            found = len(self.stack)
            for index, open_tag in enumerate(self.stack):
                if open_tag == tag:
                    found = index
        while len(self.stack) > found:
            tag = self.stack[-1]
            try:
                method = getattr(self, 'end_' + tag)
            except AttributeError:
                method = None
            if method:
                self.handle_endtag(tag, method)
            else:
                self.unknown_endtag(tag)
            del self.stack[-1]

    def handle_starttag(self, tag, method, attrs):
        """Overridable -- handle start tag by calling method(attrs)."""
        method(attrs)

    def handle_endtag(self, tag, method):
        """Overridable -- handle end tag by calling method()."""
        method()

    def report_unbalanced(self, tag):
        """Example -- report an unbalanced </...> tag.

        Prints the tag and the current stack when self.verbose is true.
        """
        if self.verbose:
            # Single-string form prints the same text under both the
            # print statement and the print function.
            print('*** Unbalanced </' + tag + '>')
            print('*** Stack: %s' % (self.stack,))

    def convert_charref(self, name):
        """Convert character reference, may be overridden.

        'name' is the decimal number as a string.  Returns None when it
        is not a number or lies outside the range 0..255.
        """
        try:
            n = int(name)
        except ValueError:
            return None
        if not 0 <= n <= 255:
            return None
        return self.convert_codepoint(n)

    def convert_codepoint(self, codepoint):
        """Return the character for the given code point."""
        return chr(codepoint)

    def handle_charref(self, name):
        """Handle character reference, no need to override."""
        replacement = self.convert_charref(name)
        if replacement is None:
            self.unknown_charref(name)
        else:
            self.handle_data(replacement)

    # Definition of entities -- derived classes may override
    entitydefs = \
        {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

    def convert_entityref(self, name):
        """Convert entity references.

        As an alternative to overriding this method; one can tailor the
        results by setting up the self.entitydefs mapping appropriately.
        Returns None for a name that is not in the mapping.
        """
        table = self.entitydefs
        if name in table:
            return table[name]
        return None

    def handle_entityref(self, name):
        """Handle entity references, no need to override."""
        replacement = self.convert_entityref(name)
        if replacement is None:
            self.unknown_entityref(name)
        else:
            # Reuse the value that was just checked instead of
            # converting the name a second time.
            self.handle_data(replacement)

    def handle_data(self, data):
        """Example -- handle data, should be overridden."""
        pass

    def handle_comment(self, data):
        """Example -- handle comment, could be overridden."""
        pass

    def handle_decl(self, decl):
        """Example -- handle declaration, could be overridden."""
        pass

    def handle_pi(self, data):
        """Example -- handle processing instruction, could be overridden."""
        pass

    # To be overridden -- handlers for unknown objects
    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_entityref(self, ref):
        pass
Guido van Rossum7c750e11995-02-27 13:16:55 +0000455
456
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that prints every parse event to stdout.

    Character data is collected in self.testdata and printed once its
    repr() reaches 70 characters or another event occurs.
    """

    def __init__(self, verbose=0):
        # testdata must exist before SGMLParser.__init__() runs reset().
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Buffer the data; flush once the repr() gets long enough."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Print and clear the buffered character data, if any."""
        data = self.testdata
        if data:
            self.testdata = ""
            print('data: ' + repr(data))

    def handle_comment(self, data):
        """Print the comment, eliding the middle of long ones."""
        self.flush()
        r = repr(data)
        if len(r) > 68:
            r = r[:32] + '...' + r[-32:]
        print('comment: ' + r)

    def unknown_starttag(self, tag, attrs):
        """Print the start tag together with its attributes."""
        self.flush()
        if not attrs:
            print('start tag: <' + tag + '>')
        else:
            # One space between the parts, including before the '>'.
            parts = ['start tag: <' + tag]
            for name, value in attrs:
                parts.append(name + '=' + '"' + value + '"')
            parts.append('>')
            print(' '.join(parts))

    def unknown_endtag(self, tag):
        """Print the end tag."""
        self.flush()
        print('end tag: </' + tag + '>')

    def unknown_entityref(self, ref):
        """Print the unknown entity reference."""
        self.flush()
        print('*** unknown entity ref: &' + ref + ';')

    def unknown_charref(self, ref):
        """Print the unknown character reference."""
        self.flush()
        print('*** unknown char ref: &#' + ref + ';')

    def unknown_decl(self, data):
        """Print the unknown declaration."""
        self.flush()
        print('*** unknown decl: [' + data + ']')

    def close(self):
        """Process the remaining input and print any buffered data."""
        SGMLParser.close(self)
        self.flush()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000510
511
def test(args = None):
    """Parse a file and print the events (command line test driver).

    args -- argument list; defaults to sys.argv[1:].  A leading '-s'
            selects the silent SGMLParser instead of TestSGMLParser.
            The next argument is the file name ('test.html' if absent);
            '-' means standard input.

    Exits with status 1 when the file cannot be opened.
    """
    import sys

    if args is None:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print('%s : %s' % (filename, msg))
            sys.exit(1)
        # Close the file even if reading fails.
        try:
            data = f.read()
        finally:
            f.close()

    x = klass()
    # Feed one character at a time to exercise the handling of
    # incomplete input.
    for c in data:
        x.feed(c)
    x.close()
Guido van Rossum7c750e11995-02-27 13:16:55 +0000546
547
# Run the command line test driver when executed as a script.
if __name__ == "__main__":
    test()