Lib/sgmllib.py - platform/external/python/cpython3 - Gitiles

 # A parser for SGML, using the derived class as static DTD.

 # XXX This only supports those SGML features used by HTML.

 # XXX There should be a way to distinguish between PCDATA (parsed
 # character data -- the normal case), RCDATA (replaceable character
 # data -- only char and entity references and end tags are special)
 # and CDATA (character data -- only end tags are special).


 import re
 import string


 # Regular expressions used for parsing

 # All patterns are raw strings so that regex escapes such as \? are never
 # interpreted (or rejected) as Python string escapes.

 # Next character that can start markup ('<') or a reference ('&').
 interesting = re.compile(r'[&<]')
 # Longest prefix of a reference, tag or declaration seen so far; used by
 # goahead() to decide whether to wait for more input.
 incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                         r'<([a-zA-Z][^<>]*|'
                         r'/([a-zA-Z][^<>]*)?|'
                         r'![^<>]*)?')

 # References: group 1 is the name/number; the match also consumes the one
 # character that terminates the reference.
 entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile(r'&#([0-9]+)[^0-9]')

 # Tag, processing-instruction, declaration and comment delimiters.
 starttagopen = re.compile(r'<[>a-zA-Z]')
 shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
 # <tag/data/ shorthand: group 1 is the tag name, group 2 the data.
 shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
 piopen = re.compile(r'<\?')
 piclose = re.compile(r'>')
 endtagopen = re.compile(r'</[<>a-zA-Z]')
 endbracket = re.compile(r'[<>]')
 special = re.compile(r'<![^<>]*>')
 commentopen = re.compile(r'<!--')
 commentclose = re.compile(r'--[%s]*>' % string.whitespace)
 tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9]*')
 # One attribute: group 1 is the name, group 2 the optional '= value' part,
 # group 3 the value including any surrounding quotes.
 attrfind = re.compile(
     r'[%s]+([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
     + (r'([%s]*=[%s]*' % (string.whitespace, string.whitespace))
     + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')


 # SGML parser base class -- find tags and call handler functions.
 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
 # The dtd is defined by deriving a class which defines methods
 # with special names to handle tags: start_foo and end_foo to handle
 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
 # (Tags are converted to lower case for this purpose.)  The data
 # between tags is passed to the parser by calling self.handle_data()
 # with some data as argument (the data may be split up in arbitrary
 # chunks).  Entity references are passed by calling
 # self.handle_entityref() with the entity reference as argument.

 class SGMLParser:
     """SGML parser base class -- find tags and call handler methods.

     Usage: ``p = SGMLParser(); p.feed(data); ...; p.close()``.

     The DTD is defined by deriving a class whose specially named methods
     handle tags: ``start_foo`` and ``end_foo`` handle ``<foo>`` and
     ``</foo>``, and ``do_foo`` handles a ``<foo>`` that stands by itself.
     Tag names are converted to lower case before the lookup.  Data between
     tags is passed to ``handle_data()`` (possibly split into arbitrary
     chunks); entity and character references are passed to
     ``handle_entityref()`` and ``handle_charref()``.
     """

     # Definition of entities -- derived classes may override
     entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

     def __init__(self, verbose=0):
         """Interface -- initialize and reset this instance.

         A true *verbose* makes report_unbalanced() print diagnostics.
         """
         self.verbose = verbose
         self.reset()

     def reset(self):
         """Interface -- reset this instance.  Loses all unprocessed data."""
         self.rawdata = ''       # text fed so far but not yet consumed
         self.stack = []         # open tags that were handled by a start_* method
         self.lasttag = '???'    # most recent start tag, used for the '<>' shorthand
         self.nomoretags = 0     # true: everything that follows is plain data
         self.literal = 0        # true: start tags, comments, PIs and
                                 # declarations are passed through as data

     def setnomoretags(self):
         """For derived classes only -- enter literal mode (CDATA) till EOF."""
         self.nomoretags = self.literal = 1

     def setliteral(self, *args):
         """For derived classes only -- enter literal mode (CDATA).

         Extra positional arguments are accepted and ignored.  goahead()
         leaves literal mode again after the next end tag.
         """
         self.literal = 1

     def feed(self, data):
         """Interface -- feed some data to the parser.

         Call this as often as you want, with as little or as much text as
         you want (may include '\\n').  The text is appended to the pending
         buffer and goahead() does the processing.
         """
         self.rawdata = self.rawdata + data
         self.goahead(0)

     def close(self):
         """Interface -- handle the remaining data."""
         self.goahead(1)

     def goahead(self, end):
         """Internal -- handle data as far as reasonable.

         May leave state and data to be processed by a subsequent call.  If
         *end* is true, force handling all data as if followed by an EOF
         marker: whatever could not be parsed is passed to handle_data().
         """
         rawdata = self.rawdata
         i = 0
         n = len(rawdata)
         while i < n:
             if self.nomoretags:
                 self.handle_data(rawdata[i:n])
                 i = n
                 break
             # Everything up to the next '<' or '&' is plain data.
             match = interesting.search(rawdata, i)
             if match:
                 j = match.start(0)
             else:
                 j = n
             if i < j:
                 self.handle_data(rawdata[i:j])
             i = j
             if i == n:
                 break
             if rawdata[i] == '<':
                 if starttagopen.match(rawdata, i):
                     if self.literal:
                         self.handle_data(rawdata[i])
                         i = i + 1
                         continue
                     # parse_starttag() returns an absolute index.
                     k = self.parse_starttag(i)
                     if k < 0:
                         break
                     i = k
                     continue
                 if endtagopen.match(rawdata, i):
                     # parse_endtag() returns an absolute index.
                     k = self.parse_endtag(i)
                     if k < 0:
                         break
                     i = k
                     self.literal = 0
                     continue
                 if commentopen.match(rawdata, i):
                     if self.literal:
                         self.handle_data(rawdata[i])
                         i = i + 1
                         continue
                     # parse_comment() returns a length, not an index.
                     k = self.parse_comment(i)
                     if k < 0:
                         break
                     i = i + k
                     continue
                 if piopen.match(rawdata, i):
                     if self.literal:
                         self.handle_data(rawdata[i])
                         i = i + 1
                         continue
                     # parse_pi() returns a length, not an index.
                     k = self.parse_pi(i)
                     if k < 0:
                         break
                     i = i + k
                     continue
                 match = special.match(rawdata, i)
                 if match:
                     if self.literal:
                         self.handle_data(rawdata[i])
                         i = i + 1
                         continue
                     # Declarations such as <!DOCTYPE ...> are skipped.
                     i = match.end(0)
                     continue
             elif rawdata[i] == '&':
                 match = charref.match(rawdata, i)
                 if match:
                     name = match.group(1)
                     self.handle_charref(name)
                     i = match.end(0)
                     # The pattern consumed the terminating character; give
                     # it back unless it was the ';' closing the reference.
                     if rawdata[i-1] != ';':
                         i = i - 1
                     continue
                 match = entityref.match(rawdata, i)
                 if match:
                     name = match.group(1)
                     self.handle_entityref(name)
                     i = match.end(0)
                     if rawdata[i-1] != ';':
                         i = i - 1
                     continue
             else:
                 raise RuntimeError('neither < nor & ??')
             # We get here only if incomplete matches but
             # nothing else
             match = incomplete.match(rawdata, i)
             if not match:
                 self.handle_data(rawdata[i])
                 i = i + 1
                 continue
             j = match.end(0)
             if j == n:
                 break  # Really incomplete
             self.handle_data(rawdata[i:j])
             i = j
         # end while
         if end and i < n:
             self.handle_data(rawdata[i:n])
             i = n
         self.rawdata = rawdata[i:]
         # XXX if end: check for empty stack

     def parse_comment(self, i):
         """Internal -- parse the comment starting at rawdata[i].

         Calls handle_comment() with the text between '<!--' and the closing
         '--'.  Returns the number of characters consumed (a length, not an
         index), or -1 if the comment is not terminated yet.  Raises
         RuntimeError if rawdata[i:] does not start with '<!--'.
         """
         rawdata = self.rawdata
         if rawdata[i:i+4] != '<!--':
             raise RuntimeError('unexpected call to parse_comment')
         match = commentclose.search(rawdata, i+4)
         if not match:
             return -1
         self.handle_comment(rawdata[i+4:match.start(0)])
         return match.end(0) - i

     def parse_pi(self, i):
         """Internal -- parse the processing instruction starting at rawdata[i].

         Calls handle_pi() with the text between '<?' and the closing '>'.
         Returns the number of characters consumed (a length, not an index),
         or -1 if the instruction is not terminated yet.  Raises
         RuntimeError if rawdata[i:] does not start with '<?'.
         """
         rawdata = self.rawdata
         if rawdata[i:i+2] != '<?':
             raise RuntimeError('unexpected call to parse_pi')
         match = piclose.search(rawdata, i+2)
         if not match:
             return -1
         self.handle_pi(rawdata[i+2:match.start(0)])
         return match.end(0) - i

     def parse_starttag(self, i):
         """Internal -- parse the start tag beginning at rawdata[i].

         Returns the index just past the tag (an absolute position, unlike
         parse_comment() and parse_pi()), or -1 if the tag is not terminated
         yet.  Raises RuntimeError if no tag name follows the '<'.
         """
         rawdata = self.rawdata
         if shorttagopen.match(rawdata, i):
             # SGML shorthand: <tag/data/ == <tag>data</tag>
             # XXX Can data contain &... (entity or char refs)?
             # XXX Can data contain < or > (tag characters)?
             # XXX Can there be whitespace before the first /?
             match = shorttag.match(rawdata, i)
             if not match:
                 return -1
             tag, data = match.group(1, 2)
             self.finish_shorttag(tag.lower(), data)
             return match.end(0)
         # XXX The following should skip matching quotes (' or ")
         match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
         j = match.start(0)
         # Now parse the data between i+1 and j into a tag and attrs
         attrs = []
         if rawdata[i:i+2] == '<>':
             # SGML shorthand: <> == <last open tag seen>
             k = j
             tag = self.lasttag
         else:
             match = tagfind.match(rawdata, i+1)
             if not match:
                 raise RuntimeError('unexpected call to parse_starttag')
             k = match.end(0)
             tag = rawdata[i+1:k].lower()
             self.lasttag = tag
         while k < j:
             match = attrfind.match(rawdata, k)
             if not match:
                 break
             attrname, rest, attrvalue = match.group(1, 2, 3)
             if not rest:
                 # Attribute without '= value': the name doubles as the value.
                 attrvalue = attrname
             elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                   attrvalue[:1] == '"' == attrvalue[-1:]):
                 # Strip the matching pair of quotes.
                 attrvalue = attrvalue[1:-1]
             attrs.append((attrname.lower(), attrvalue))
             k = match.end(0)
         # A tag ended by '<' leaves that '<' in place for the next tag.
         if rawdata[j] == '>':
             j = j + 1
         self.finish_starttag(tag, attrs)
         return j

     def parse_endtag(self, i):
         """Internal -- parse the end tag beginning at rawdata[i].

         Returns the index just past the tag, or -1 if it is not terminated
         yet.
         """
         rawdata = self.rawdata
         match = endbracket.search(rawdata, i+1)
         if not match:
             return -1
         j = match.start(0)
         tag = rawdata[i+2:j].strip().lower()
         if rawdata[j] == '>':
             j = j + 1
         self.finish_endtag(tag)
         return j

     def finish_shorttag(self, tag, data):
         """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
         self.finish_starttag(tag, [])
         self.handle_data(data)
         self.finish_endtag(tag)

     def finish_starttag(self, tag, attrs):
         """Internal -- finish processing of start tag.

         Return -1 for unknown tag, 0 for open-only tag (do_* handler),
         1 for balanced tag (start_* handler; the tag is pushed on the stack).
         """
         try:
             method = getattr(self, 'start_' + tag)
         except AttributeError:
             try:
                 method = getattr(self, 'do_' + tag)
             except AttributeError:
                 self.unknown_starttag(tag, attrs)
                 return -1
             else:
                 self.handle_starttag(tag, method, attrs)
                 return 0
         else:
             self.stack.append(tag)
             self.handle_starttag(tag, method, attrs)
             return 1

     def finish_endtag(self, tag):
         """Internal -- finish processing of end tag.

         An empty *tag* closes the innermost open element.  Otherwise every
         element opened after the innermost open *tag* is closed as well.
         """
         if not tag:
             found = len(self.stack) - 1
             if found < 0:
                 self.unknown_endtag(tag)
                 return
         else:
             if tag not in self.stack:
                 # Not open: unbalanced if a handler exists, else unknown.
                 try:
                     getattr(self, 'end_' + tag)
                 except AttributeError:
                     self.unknown_endtag(tag)
                 else:
                     self.report_unbalanced(tag)
                 return
             # Find the innermost (last) occurrence of tag on the stack.
             found = len(self.stack)
             for i in range(found):
                 if self.stack[i] == tag:
                     found = i
         while len(self.stack) > found:
             tag = self.stack[-1]
             try:
                 method = getattr(self, 'end_' + tag)
             except AttributeError:
                 method = None
             if method:
                 self.handle_endtag(tag, method)
             else:
                 self.unknown_endtag(tag)
             del self.stack[-1]

     def handle_starttag(self, tag, method, attrs):
         """Overridable -- handle start tag by calling method(attrs)."""
         method(attrs)

     def handle_endtag(self, tag, method):
         """Overridable -- handle end tag by calling method()."""
         method()

     def report_unbalanced(self, tag):
         """Example -- report an unbalanced </...> tag (only when verbose)."""
         if self.verbose:
             print('*** Unbalanced </' + tag + '>')
             print('*** Stack: ' + repr(self.stack))

     def handle_charref(self, name):
         """Example -- handle character reference, no need to override.

         References in the range 0..255 are passed to handle_data() as a
         single character; anything else goes to unknown_charref().
         """
         try:
             n = int(name)
         except ValueError:
             self.unknown_charref(name)
             return
         if not 0 <= n <= 255:
             self.unknown_charref(name)
             return
         self.handle_data(chr(n))

     def handle_entityref(self, name):
         """Example -- handle entity reference, no need to override.

         Names found in entitydefs are passed to handle_data() as their
         replacement text; others go to unknown_entityref().
         """
         table = self.entitydefs
         if name in table:
             self.handle_data(table[name])
         else:
             self.unknown_entityref(name)

     def handle_data(self, data):
         """Example -- handle data, should be overridden."""
         pass

     def handle_comment(self, data):
         """Example -- handle comment, could be overridden."""
         pass

     def handle_pi(self, data):
         """Example -- handle processing instruction, could be overridden."""
         pass

     # To be overridden -- handlers for unknown objects
     def unknown_starttag(self, tag, attrs): pass
     def unknown_endtag(self, tag): pass
     def unknown_charref(self, ref): pass
     def unknown_entityref(self, ref): pass


 class TestSGMLParser(SGMLParser):
     """Demonstration parser that prints every event it receives.

     Character data is accumulated in self.testdata and printed as one
     'data:' line when its repr() reaches 70 characters, or when another
     event or close() flushes it.
     """

     def __init__(self, verbose=0):
         self.testdata = ""
         SGMLParser.__init__(self, verbose)

     def handle_data(self, data):
         """Buffer *data*; flush once the buffered repr is 70+ characters."""
         self.testdata = self.testdata + data
         if len(repr(self.testdata)) >= 70:
             self.flush()

     def flush(self):
         """Print and clear the buffered character data, if any."""
         data = self.testdata
         if data:
             self.testdata = ""
             print('data: ' + repr(data))

     def handle_comment(self, data):
         """Print the comment's repr, eliding the middle of long comments."""
         self.flush()
         r = repr(data)
         if len(r) > 68:
             r = r[:32] + '...' + r[-32:]
         print('comment: ' + r)

     def unknown_starttag(self, tag, attrs):
         """Print the start tag together with its attributes."""
         self.flush()
         if not attrs:
             print('start tag: <' + tag + '>')
         else:
             # Build the line first; the pieces are separated by one space,
             # so the closing '>' is preceded by a space as well.
             pieces = ['start tag: <' + tag]
             for name, value in attrs:
                 pieces.append(name + '=' + '"' + value + '"')
             pieces.append('>')
             print(' '.join(pieces))

     def unknown_endtag(self, tag):
         """Print the end tag."""
         self.flush()
         print('end tag: </' + tag + '>')

     def unknown_entityref(self, ref):
         """Report an entity reference that is not in entitydefs."""
         self.flush()
         print('*** unknown entity ref: &' + ref + ';')

     def unknown_charref(self, ref):
         """Report a character reference that handle_charref() rejected."""
         self.flush()
         print('*** unknown char ref: &#' + ref + ';')

     def close(self):
         """Finish parsing, then print any data that is still buffered."""
         SGMLParser.close(self)
         self.flush()


 def test(args = None):
     """Parse a file and report what was found; command-line style driver.

     *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
     SGMLParser base class instead of TestSGMLParser.  The next argument
     names the file to parse: '-' means standard input, and 'test.html' is
     used when no file is given.  Prints a message and exits with status 1
     if the file cannot be opened.
     """
     import sys

     if not args:
         args = sys.argv[1:]

     if args and args[0] == '-s':
         args = args[1:]
         klass = SGMLParser
     else:
         klass = TestSGMLParser

     if args:
         filename = args[0]
     else:
         filename = 'test.html'

     if filename == '-':
         f = sys.stdin
     else:
         try:
             f = open(filename, 'r')
         except IOError as msg:
             print(str(filename) + ' : ' + str(msg))
             sys.exit(1)

     # Close the file even if reading fails; never close standard input.
     try:
         data = f.read()
     finally:
         if f is not sys.stdin:
             f.close()

     x = klass()
     # The data is fed one character at a time -- presumably to exercise the
     # parser's handling of input split at arbitrary points.
     for c in data:
         x.feed(c)
     x.close()

 # Run the command-line driver only when this file is executed as a script.
 if __name__ == '__main__':
     test()
	# A parser for SGML, using the derived class as static DTD.

	# XXX This only supports those SGML features used by HTML.

	# XXX There should be a way to distinguish between PCDATA (parsed
	# character data -- the normal case), RCDATA (replaceable character
	# data -- only char and entity references and end tags are special)
	# and CDATA (character data -- only end tags are special).


	import re
	import string


	# Regular expressions used for parsing

	# All patterns are raw strings so that regex escapes such as \? are never
	# interpreted (or rejected) as Python string escapes.

	# Next character that can start markup ('<') or a reference ('&').
	interesting = re.compile(r'[&<]')
	# Longest prefix of a reference, tag or declaration seen so far; used by
	# goahead() to decide whether to wait for more input.
	incomplete = re.compile(r'&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
	                        r'<([a-zA-Z][^<>]*|'
	                        r'/([a-zA-Z][^<>]*)?|'
	                        r'![^<>]*)?')

	# References: group 1 is the name/number; the match also consumes the one
	# character that terminates the reference.
	entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
	charref = re.compile(r'&#([0-9]+)[^0-9]')

	# Tag, processing-instruction, declaration and comment delimiters.
	starttagopen = re.compile(r'<[>a-zA-Z]')
	shorttagopen = re.compile(r'<[a-zA-Z][-.a-zA-Z0-9]*/')
	# <tag/data/ shorthand: group 1 is the tag name, group 2 the data.
	shorttag = re.compile(r'<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
	piopen = re.compile(r'<\?')
	piclose = re.compile(r'>')
	endtagopen = re.compile(r'</[<>a-zA-Z]')
	endbracket = re.compile(r'[<>]')
	special = re.compile(r'<![^<>]*>')
	commentopen = re.compile(r'<!--')
	commentclose = re.compile(r'--[%s]*>' % string.whitespace)
	tagfind = re.compile(r'[a-zA-Z][-.a-zA-Z0-9]*')
	# One attribute: group 1 is the name, group 2 the optional '= value' part,
	# group 3 the value including any surrounding quotes.
	attrfind = re.compile(
	    r'[%s]+([a-zA-Z_][-.a-zA-Z_0-9]*)' % string.whitespace
	    + (r'([%s]*=[%s]*' % (string.whitespace, string.whitespace))
	    + r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:+*%?!\(\)_#=~]*))?')


	# SGML parser base class -- find tags and call handler functions.
	# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
	# The dtd is defined by deriving a class which defines methods
	# with special names to handle tags: start_foo and end_foo to handle
	# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
	# (Tags are converted to lower case for this purpose.) The data
	# between tags is passed to the parser by calling self.handle_data()
	# with some data as argument (the data may be split up in arbitrary
	# chunks). Entity references are passed by calling
	# self.handle_entityref() with the entity reference as argument.

	class SGMLParser:
	    """SGML parser base class -- find tags and call handler methods.

	    Usage: ``p = SGMLParser(); p.feed(data); ...; p.close()``.

	    The DTD is defined by deriving a class whose specially named methods
	    handle tags: ``start_foo`` and ``end_foo`` handle ``<foo>`` and
	    ``</foo>``, and ``do_foo`` handles a ``<foo>`` that stands by itself.
	    Tag names are converted to lower case before the lookup.  Data between
	    tags is passed to ``handle_data()`` (possibly split into arbitrary
	    chunks); entity and character references are passed to
	    ``handle_entityref()`` and ``handle_charref()``.
	    """

	    # Definition of entities -- derived classes may override
	    entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}

	    def __init__(self, verbose=0):
	        """Interface -- initialize and reset this instance.

	        A true *verbose* makes report_unbalanced() print diagnostics.
	        """
	        self.verbose = verbose
	        self.reset()

	    def reset(self):
	        """Interface -- reset this instance.  Loses all unprocessed data."""
	        self.rawdata = ''       # text fed so far but not yet consumed
	        self.stack = []         # open tags that were handled by a start_* method
	        self.lasttag = '???'    # most recent start tag, used for the '<>' shorthand
	        self.nomoretags = 0     # true: everything that follows is plain data
	        self.literal = 0        # true: start tags, comments, PIs and
	                                # declarations are passed through as data

	    def setnomoretags(self):
	        """For derived classes only -- enter literal mode (CDATA) till EOF."""
	        self.nomoretags = self.literal = 1

	    def setliteral(self, *args):
	        """For derived classes only -- enter literal mode (CDATA).

	        Extra positional arguments are accepted and ignored.  goahead()
	        leaves literal mode again after the next end tag.
	        """
	        self.literal = 1

	    def feed(self, data):
	        """Interface -- feed some data to the parser.

	        Call this as often as you want, with as little or as much text as
	        you want (may include '\\n').  The text is appended to the pending
	        buffer and goahead() does the processing.
	        """
	        self.rawdata = self.rawdata + data
	        self.goahead(0)

	    def close(self):
	        """Interface -- handle the remaining data."""
	        self.goahead(1)

	    def goahead(self, end):
	        """Internal -- handle data as far as reasonable.

	        May leave state and data to be processed by a subsequent call.  If
	        *end* is true, force handling all data as if followed by an EOF
	        marker: whatever could not be parsed is passed to handle_data().
	        """
	        rawdata = self.rawdata
	        i = 0
	        n = len(rawdata)
	        while i < n:
	            if self.nomoretags:
	                self.handle_data(rawdata[i:n])
	                i = n
	                break
	            # Everything up to the next '<' or '&' is plain data.
	            match = interesting.search(rawdata, i)
	            if match:
	                j = match.start(0)
	            else:
	                j = n
	            if i < j:
	                self.handle_data(rawdata[i:j])
	            i = j
	            if i == n:
	                break
	            if rawdata[i] == '<':
	                if starttagopen.match(rawdata, i):
	                    if self.literal:
	                        self.handle_data(rawdata[i])
	                        i = i + 1
	                        continue
	                    # parse_starttag() returns an absolute index.
	                    k = self.parse_starttag(i)
	                    if k < 0:
	                        break
	                    i = k
	                    continue
	                if endtagopen.match(rawdata, i):
	                    # parse_endtag() returns an absolute index.
	                    k = self.parse_endtag(i)
	                    if k < 0:
	                        break
	                    i = k
	                    self.literal = 0
	                    continue
	                if commentopen.match(rawdata, i):
	                    if self.literal:
	                        self.handle_data(rawdata[i])
	                        i = i + 1
	                        continue
	                    # parse_comment() returns a length, not an index.
	                    k = self.parse_comment(i)
	                    if k < 0:
	                        break
	                    i = i + k
	                    continue
	                if piopen.match(rawdata, i):
	                    if self.literal:
	                        self.handle_data(rawdata[i])
	                        i = i + 1
	                        continue
	                    # parse_pi() returns a length, not an index.
	                    k = self.parse_pi(i)
	                    if k < 0:
	                        break
	                    i = i + k
	                    continue
	                match = special.match(rawdata, i)
	                if match:
	                    if self.literal:
	                        self.handle_data(rawdata[i])
	                        i = i + 1
	                        continue
	                    # Declarations such as <!DOCTYPE ...> are skipped.
	                    i = match.end(0)
	                    continue
	            elif rawdata[i] == '&':
	                match = charref.match(rawdata, i)
	                if match:
	                    name = match.group(1)
	                    self.handle_charref(name)
	                    i = match.end(0)
	                    # The pattern consumed the terminating character; give
	                    # it back unless it was the ';' closing the reference.
	                    if rawdata[i-1] != ';':
	                        i = i - 1
	                    continue
	                match = entityref.match(rawdata, i)
	                if match:
	                    name = match.group(1)
	                    self.handle_entityref(name)
	                    i = match.end(0)
	                    if rawdata[i-1] != ';':
	                        i = i - 1
	                    continue
	            else:
	                raise RuntimeError('neither < nor & ??')
	            # We get here only if incomplete matches but
	            # nothing else
	            match = incomplete.match(rawdata, i)
	            if not match:
	                self.handle_data(rawdata[i])
	                i = i + 1
	                continue
	            j = match.end(0)
	            if j == n:
	                break  # Really incomplete
	            self.handle_data(rawdata[i:j])
	            i = j
	        # end while
	        if end and i < n:
	            self.handle_data(rawdata[i:n])
	            i = n
	        self.rawdata = rawdata[i:]
	        # XXX if end: check for empty stack

	    def parse_comment(self, i):
	        """Internal -- parse the comment starting at rawdata[i].

	        Calls handle_comment() with the text between '<!--' and the closing
	        '--'.  Returns the number of characters consumed (a length, not an
	        index), or -1 if the comment is not terminated yet.  Raises
	        RuntimeError if rawdata[i:] does not start with '<!--'.
	        """
	        rawdata = self.rawdata
	        if rawdata[i:i+4] != '<!--':
	            raise RuntimeError('unexpected call to parse_comment')
	        match = commentclose.search(rawdata, i+4)
	        if not match:
	            return -1
	        self.handle_comment(rawdata[i+4:match.start(0)])
	        return match.end(0) - i

	    def parse_pi(self, i):
	        """Internal -- parse the processing instruction starting at rawdata[i].

	        Calls handle_pi() with the text between '<?' and the closing '>'.
	        Returns the number of characters consumed (a length, not an index),
	        or -1 if the instruction is not terminated yet.  Raises
	        RuntimeError if rawdata[i:] does not start with '<?'.
	        """
	        rawdata = self.rawdata
	        if rawdata[i:i+2] != '<?':
	            raise RuntimeError('unexpected call to parse_pi')
	        match = piclose.search(rawdata, i+2)
	        if not match:
	            return -1
	        self.handle_pi(rawdata[i+2:match.start(0)])
	        return match.end(0) - i

	    def parse_starttag(self, i):
	        """Internal -- parse the start tag beginning at rawdata[i].

	        Returns the index just past the tag (an absolute position, unlike
	        parse_comment() and parse_pi()), or -1 if the tag is not terminated
	        yet.  Raises RuntimeError if no tag name follows the '<'.
	        """
	        rawdata = self.rawdata
	        if shorttagopen.match(rawdata, i):
	            # SGML shorthand: <tag/data/ == <tag>data</tag>
	            # XXX Can data contain &... (entity or char refs)?
	            # XXX Can data contain < or > (tag characters)?
	            # XXX Can there be whitespace before the first /?
	            match = shorttag.match(rawdata, i)
	            if not match:
	                return -1
	            tag, data = match.group(1, 2)
	            self.finish_shorttag(tag.lower(), data)
	            return match.end(0)
	        # XXX The following should skip matching quotes (' or ")
	        match = endbracket.search(rawdata, i+1)
	        if not match:
	            return -1
	        j = match.start(0)
	        # Now parse the data between i+1 and j into a tag and attrs
	        attrs = []
	        if rawdata[i:i+2] == '<>':
	            # SGML shorthand: <> == <last open tag seen>
	            k = j
	            tag = self.lasttag
	        else:
	            match = tagfind.match(rawdata, i+1)
	            if not match:
	                raise RuntimeError('unexpected call to parse_starttag')
	            k = match.end(0)
	            tag = rawdata[i+1:k].lower()
	            self.lasttag = tag
	        while k < j:
	            match = attrfind.match(rawdata, k)
	            if not match:
	                break
	            attrname, rest, attrvalue = match.group(1, 2, 3)
	            if not rest:
	                # Attribute without '= value': the name doubles as the value.
	                attrvalue = attrname
	            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
	                  attrvalue[:1] == '"' == attrvalue[-1:]):
	                # Strip the matching pair of quotes.
	                attrvalue = attrvalue[1:-1]
	            attrs.append((attrname.lower(), attrvalue))
	            k = match.end(0)
	        # A tag ended by '<' leaves that '<' in place for the next tag.
	        if rawdata[j] == '>':
	            j = j + 1
	        self.finish_starttag(tag, attrs)
	        return j

	    def parse_endtag(self, i):
	        """Internal -- parse the end tag beginning at rawdata[i].

	        Returns the index just past the tag, or -1 if it is not terminated
	        yet.
	        """
	        rawdata = self.rawdata
	        match = endbracket.search(rawdata, i+1)
	        if not match:
	            return -1
	        j = match.start(0)
	        tag = rawdata[i+2:j].strip().lower()
	        if rawdata[j] == '>':
	            j = j + 1
	        self.finish_endtag(tag)
	        return j

	    def finish_shorttag(self, tag, data):
	        """Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)."""
	        self.finish_starttag(tag, [])
	        self.handle_data(data)
	        self.finish_endtag(tag)

	    def finish_starttag(self, tag, attrs):
	        """Internal -- finish processing of start tag.

	        Return -1 for unknown tag, 0 for open-only tag (do_* handler),
	        1 for balanced tag (start_* handler; the tag is pushed on the stack).
	        """
	        try:
	            method = getattr(self, 'start_' + tag)
	        except AttributeError:
	            try:
	                method = getattr(self, 'do_' + tag)
	            except AttributeError:
	                self.unknown_starttag(tag, attrs)
	                return -1
	            else:
	                self.handle_starttag(tag, method, attrs)
	                return 0
	        else:
	            self.stack.append(tag)
	            self.handle_starttag(tag, method, attrs)
	            return 1

	    def finish_endtag(self, tag):
	        """Internal -- finish processing of end tag.

	        An empty *tag* closes the innermost open element.  Otherwise every
	        element opened after the innermost open *tag* is closed as well.
	        """
	        if not tag:
	            found = len(self.stack) - 1
	            if found < 0:
	                self.unknown_endtag(tag)
	                return
	        else:
	            if tag not in self.stack:
	                # Not open: unbalanced if a handler exists, else unknown.
	                try:
	                    getattr(self, 'end_' + tag)
	                except AttributeError:
	                    self.unknown_endtag(tag)
	                else:
	                    self.report_unbalanced(tag)
	                return
	            # Find the innermost (last) occurrence of tag on the stack.
	            found = len(self.stack)
	            for i in range(found):
	                if self.stack[i] == tag:
	                    found = i
	        while len(self.stack) > found:
	            tag = self.stack[-1]
	            try:
	                method = getattr(self, 'end_' + tag)
	            except AttributeError:
	                method = None
	            if method:
	                self.handle_endtag(tag, method)
	            else:
	                self.unknown_endtag(tag)
	            del self.stack[-1]

	    def handle_starttag(self, tag, method, attrs):
	        """Overridable -- handle start tag by calling method(attrs)."""
	        method(attrs)

	    def handle_endtag(self, tag, method):
	        """Overridable -- handle end tag by calling method()."""
	        method()

	    def report_unbalanced(self, tag):
	        """Example -- report an unbalanced </...> tag (only when verbose)."""
	        if self.verbose:
	            print('*** Unbalanced </' + tag + '>')
	            print('*** Stack: ' + repr(self.stack))

	    def handle_charref(self, name):
	        """Example -- handle character reference, no need to override.

	        References in the range 0..255 are passed to handle_data() as a
	        single character; anything else goes to unknown_charref().
	        """
	        try:
	            n = int(name)
	        except ValueError:
	            self.unknown_charref(name)
	            return
	        if not 0 <= n <= 255:
	            self.unknown_charref(name)
	            return
	        self.handle_data(chr(n))

	    def handle_entityref(self, name):
	        """Example -- handle entity reference, no need to override.

	        Names found in entitydefs are passed to handle_data() as their
	        replacement text; others go to unknown_entityref().
	        """
	        table = self.entitydefs
	        if name in table:
	            self.handle_data(table[name])
	        else:
	            self.unknown_entityref(name)

	    def handle_data(self, data):
	        """Example -- handle data, should be overridden."""
	        pass

	    def handle_comment(self, data):
	        """Example -- handle comment, could be overridden."""
	        pass

	    def handle_pi(self, data):
	        """Example -- handle processing instruction, could be overridden."""
	        pass

	    # To be overridden -- handlers for unknown objects
	    def unknown_starttag(self, tag, attrs): pass
	    def unknown_endtag(self, tag): pass
	    def unknown_charref(self, ref): pass
	    def unknown_entityref(self, ref): pass


	class TestSGMLParser(SGMLParser):
	    """Demonstration parser that prints every event it receives.

	    Character data is accumulated in self.testdata and printed as one
	    'data:' line when its repr() reaches 70 characters, or when another
	    event or close() flushes it.
	    """

	    def __init__(self, verbose=0):
	        self.testdata = ""
	        SGMLParser.__init__(self, verbose)

	    def handle_data(self, data):
	        """Buffer *data*; flush once the buffered repr is 70+ characters."""
	        self.testdata = self.testdata + data
	        if len(repr(self.testdata)) >= 70:
	            self.flush()

	    def flush(self):
	        """Print and clear the buffered character data, if any."""
	        data = self.testdata
	        if data:
	            self.testdata = ""
	            print('data: ' + repr(data))

	    def handle_comment(self, data):
	        """Print the comment's repr, eliding the middle of long comments."""
	        self.flush()
	        r = repr(data)
	        if len(r) > 68:
	            r = r[:32] + '...' + r[-32:]
	        print('comment: ' + r)

	    def unknown_starttag(self, tag, attrs):
	        """Print the start tag together with its attributes."""
	        self.flush()
	        if not attrs:
	            print('start tag: <' + tag + '>')
	        else:
	            # Build the line first; the pieces are separated by one space,
	            # so the closing '>' is preceded by a space as well.
	            pieces = ['start tag: <' + tag]
	            for name, value in attrs:
	                pieces.append(name + '=' + '"' + value + '"')
	            pieces.append('>')
	            print(' '.join(pieces))

	    def unknown_endtag(self, tag):
	        """Print the end tag."""
	        self.flush()
	        print('end tag: </' + tag + '>')

	    def unknown_entityref(self, ref):
	        """Report an entity reference that is not in entitydefs."""
	        self.flush()
	        print('*** unknown entity ref: &' + ref + ';')

	    def unknown_charref(self, ref):
	        """Report a character reference that handle_charref() rejected."""
	        self.flush()
	        print('*** unknown char ref: &#' + ref + ';')

	    def close(self):
	        """Finish parsing, then print any data that is still buffered."""
	        SGMLParser.close(self)
	        self.flush()


	def test(args = None):
	    """Parse a file and report what was found; command-line style driver.

	    *args* defaults to sys.argv[1:].  A leading '-s' selects the silent
	    SGMLParser base class instead of TestSGMLParser.  The next argument
	    names the file to parse: '-' means standard input, and 'test.html' is
	    used when no file is given.  Prints a message and exits with status 1
	    if the file cannot be opened.
	    """
	    import sys

	    if not args:
	        args = sys.argv[1:]

	    if args and args[0] == '-s':
	        args = args[1:]
	        klass = SGMLParser
	    else:
	        klass = TestSGMLParser

	    if args:
	        filename = args[0]
	    else:
	        filename = 'test.html'

	    if filename == '-':
	        f = sys.stdin
	    else:
	        try:
	            f = open(filename, 'r')
	        except IOError as msg:
	            print(str(filename) + ' : ' + str(msg))
	            sys.exit(1)

	    # Close the file even if reading fails; never close standard input.
	    try:
	        data = f.read()
	    finally:
	        if f is not sys.stdin:
	            f.close()

	    x = klass()
	    # The data is fed one character at a time -- presumably to exercise the
	    # parser's handling of input split at arbitrary points.
	    for c in data:
	        x.feed(c)
	    x.close()


	# Run the command-line driver only when this file is executed as a script.
	if __name__ == '__main__':
	    test()