Lib/html/parser.py - platform/external/python/cpython3 - Gitiles

 """A parser for HTML and XHTML."""

 # This file is based on sgmllib.py, but the API is slightly different.

 # XXX There should be a way to distinguish between PCDATA (parsed
 # character data -- the normal case), RCDATA (replaceable character
 # data -- only char and entity references and end tags are special)
 # and CDATA (character data -- only end tags are special).


 import re
 import warnings
 import _markupbase

 from html import unescape


 __all__ = ['HTMLParser']

 # Regular expressions used for parsing

 # Characters that interrupt a run of plain text outside CDATA elements.
 interesting_normal = re.compile('[&<]')
 # Start of something that may turn out to be an entity or char reference.
 incomplete = re.compile('&[a-zA-Z#]')

 # Both reference patterns also consume the single character that terminates
 # the reference (';' or anything else); callers inspect that character.
 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

 starttagopen = re.compile('<[a-zA-Z]')
 piclose = re.compile('>')
 commentclose = re.compile(r'--\s*>')
 # Note:
 #  1) the strict attrfind isn't really strict, but we can't make it
 #     correctly strict without breaking backward compatibility;
 #  2) if you change tagfind/attrfind remember to update locatestarttagend too;
 #  3) if you change tagfind/attrfind and/or locatestarttagend the parser will
 #     explode, so don't do it.
 #
 # The patterns below are raw strings so that regex escapes such as \s reach
 # the re module untouched instead of being (invalid) string-literal escapes.
 tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
 tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
     r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
 attrfind_tolerant = re.compile(
     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
 locatestarttagend = re.compile(r"""
   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
   (?:\s+                             # whitespace before attribute name
     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
       (?:\s*=\s*                     # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |\"[^\"]*\"                # LIT-enclosed value
           |[^'\">\s]+                # bare value
          )
        )?
      )
    )*
   \s*                                # trailing whitespace
 """, re.VERBOSE)
 locatestarttagend_tolerant = re.compile(r"""
   <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
   (?:[\s/]*                          # optional whitespace before attribute name
     (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
       (?:\s*=+\s*                    # value indicator
         (?:'[^']*'                   # LITA-enclosed value
           |"[^"]*"                   # LIT-enclosed value
           |(?!['"])[^>\s]*           # bare value
          )
          (?:\s*,)*                   # possibly followed by a comma
        )?(?:\s|/(?!>))*
      )*
    )?
   \s*                                # trailing whitespace
 """, re.VERBOSE)
 endendtag = re.compile('>')
 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
 # </ and the tag name, so maybe this should be fixed
 endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


 class HTMLParseError(Exception):
     """Error raised by the parser when it rejects its input.

     Attributes:
         msg: the error description (must be truthy).
         lineno: first item of *position*, or None when unknown.
         offset: second item of *position* (zero-based column), or None.
     """

     def __init__(self, msg, position=(None, None)):
         assert msg
         self.msg = msg
         # position is a (lineno, offset) pair.
         self.lineno = position[0]
         self.offset = position[1]

     def __str__(self):
         # Start from the bare message and append whichever location parts
         # are known; the column is reported one-based.
         text = self.msg
         if self.lineno is not None:
             text += ", at line %d" % self.lineno
         if self.offset is not None:
             text += ", column %d" % (self.offset + 1)
         return text


 # Unique marker used as the default value of HTMLParser.__init__ arguments,
 # so that "argument not passed" can be told apart from any explicit value.
 _default_sentinel = object()

 class HTMLParser(_markupbase.ParserBase):
     """Find tags and other markup and call handler functions.

     Usage:
         p = HTMLParser()
         p.feed(data)
         ...
         p.close()

     Start tags are handled by calling self.handle_starttag() or
     self.handle_startendtag(); end tags by self.handle_endtag().  The
     data between tags is passed from the parser to the derived class
     by calling self.handle_data() with the data as argument (the data
     may be split up in arbitrary chunks).  If convert_charrefs is
     True the character references are converted automatically to the
     corresponding Unicode character (and self.handle_data() is no
     longer split in chunks), otherwise they are passed by calling
     self.handle_entityref() or self.handle_charref() with the string
     containing respectively the named or numeric reference as the
     argument.
     """

     CDATA_CONTENT_ELEMENTS = ("script", "style")

     def __init__(self, strict=_default_sentinel, *,
                  convert_charrefs=_default_sentinel):
         """Create a parser and put it in its initial state.

         strict: deprecated.  Passing any value emits a DeprecationWarning;
             when omitted the parser is non-strict (False) and tolerates
             invalid markup instead of raising an error.
         convert_charrefs: when True, character references are converted to
             the corresponding Unicode characters automatically.  When
             omitted it is False and a DeprecationWarning announces that
             the default will become True in 3.5.
         """
         if strict is _default_sentinel:
             strict = False  # default
         else:
             warnings.warn("The strict argument and mode are deprecated.",
                           DeprecationWarning, stacklevel=2)
         self.strict = strict

         if convert_charrefs is _default_sentinel:
             warnings.warn("The value of convert_charrefs will become True in "
                           "3.5. You are encouraged to set the value explicitly.",
                           DeprecationWarning, stacklevel=2)
             convert_charrefs = False  # default
         self.convert_charrefs = convert_charrefs

         self.reset()

     def reset(self):
         """Return the parser to its initial state, dropping buffered input."""
         # Nothing buffered, no start tag seen yet.
         self.rawdata = ''
         self.lasttag = '???'
         # Not inside a CDATA content element: scan for '&' and '<'.
         self.cdata_elem = None
         self.interesting = interesting_normal
         _markupbase.ParserBase.reset(self)

     def feed(self, data):
         r"""Append *data* to the input buffer and parse what is complete.

         May be called any number of times with chunks of any size (the
         text may include '\n').  Input that cannot be processed yet stays
         buffered for a later call.
         """
         self.rawdata += data
         self.goahead(0)

     def close(self):
         """Handle any buffered data.

         Runs goahead() with a true 'end' flag, i.e. the buffered input is
         processed as if it were followed by an EOF marker.
         """
         self.goahead(1)

     def error(self, message):
         """Warn that this method is deprecated, then raise HTMLParseError.

         The exception carries *message* and the position reported by
         self.getpos().
         """
         warnings.warn("The 'error' method is deprecated.",
                       DeprecationWarning, stacklevel=2)
         position = self.getpos()
         raise HTMLParseError(message, position)

     # Source text of the most recently parsed start tag.  parse_starttag()
     # resets it to None on entry and stores the tag text once the tag is
     # known to be complete.
     __starttag_text = None

     def get_starttag_text(self):
         """Return full source of start tag: '<...>'.

         Returns None if no complete start tag has been parsed yet.
         """
         return self.__starttag_text

     def set_cdata_mode(self, elem):
         """Enter CDATA mode for the element named *elem*.

         While in this mode the only "interesting" construct is the closing
         tag of that element, matched case-insensitively with optional
         whitespace after '</' and before '>'.
         """
         self.cdata_elem = elem.lower()
         # Escape the name so that regex metacharacters in a custom
         # CDATA_CONTENT_ELEMENTS entry are matched literally.
         self.interesting = re.compile(
             r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

     def clear_cdata_mode(self):
         """Leave CDATA mode and scan for '&' and '<' again."""
         self.cdata_elem = None
         self.interesting = interesting_normal

     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
     # true, force handling all data as if followed by EOF marker.
     def goahead(self, end):
         """Parse self.rawdata as far as possible, calling the handlers.

         feed() calls this with end=0 and close() with end=1.  When 'end'
         is false, an incomplete construct at the end of the buffer stops
         processing and stays in self.rawdata for a later call.  When 'end'
         is true the remaining input is handled as if followed by EOF.
         """
         rawdata = self.rawdata
         i = 0
         n = len(rawdata)
         while i < n:
             # Step 1: find j, the end of the run of plain text starting at i.
             if self.convert_charrefs and not self.cdata_elem:
                 # Only '<' ends the text run here; references inside the
                 # run are converted by unescape() below.
                 j = rawdata.find('<', i)
                 if j < 0:
                     if not end:
                         break  # wait till we get all the text
                     j = n
             else:
                 # self.interesting is either interesting_normal ('&' or '<')
                 # or, in CDATA mode, the closing-tag pattern installed by
                 # set_cdata_mode().
                 match = self.interesting.search(rawdata, i)  # < or &
                 if match:
                     j = match.start()
                 else:
                     # In CDATA mode nothing is emitted until the closing
                     # tag shows up; the text stays buffered.
                     if self.cdata_elem:
                         break
                     j = n
             # Step 2: report the text run (if any) and advance to j.
             if i < j:
                 if self.convert_charrefs and not self.cdata_elem:
                     self.handle_data(unescape(rawdata[i:j]))
                 else:
                     self.handle_data(rawdata[i:j])
             i = self.updatepos(i, j)
             if i == n: break
             # Step 3: dispatch on the construct that starts at i.
             startswith = rawdata.startswith
             if startswith('<', i):
                 if starttagopen.match(rawdata, i): # < + letter
                     k = self.parse_starttag(i)
                 elif startswith("</", i):
                     k = self.parse_endtag(i)
                 elif startswith("<!--", i):
                     k = self.parse_comment(i)
                 elif startswith("<?", i):
                     k = self.parse_pi(i)
                 elif startswith("<!", i):
                     if self.strict:
                         k = self.parse_declaration(i)
                     else:
                         k = self.parse_html_declaration(i)
                 elif (i + 1) < n:
                     # A '<' that starts none of the constructs above and is
                     # not the last buffered character is plain text.
                     self.handle_data("<")
                     k = i + 1
                 else:
                     # Lone '<' at the very end of the buffer.
                     break
                 if k < 0:
                     # The sub-parser reported an unterminated construct.
                     if not end:
                         break
                     if self.strict:
                         self.error("EOF in middle of construct")
                     # At EOF, emit the broken construct as data: up to and
                     # including the next '>', else up to the next '<',
                     # else just the single '<'.
                     k = rawdata.find('>', i + 1)
                     if k < 0:
                         k = rawdata.find('<', i + 1)
                         if k < 0:
                             k = i + 1
                     else:
                         k += 1
                     if self.convert_charrefs and not self.cdata_elem:
                         self.handle_data(unescape(rawdata[i:k]))
                     else:
                         self.handle_data(rawdata[i:k])
                 i = self.updatepos(i, k)
             elif startswith("&#", i):
                 match = charref.match(rawdata, i)
                 if match:
                     # Strip the leading '&#' and the one terminating
                     # character that charref also matched.
                     name = match.group()[2:-1]
                     self.handle_charref(name)
                     k = match.end()
                     # A terminator other than ';' is not part of the
                     # reference, so leave it in the buffer.
                     if not startswith(';', k-1):
                         k = k - 1
                     i = self.updatepos(i, k)
                     continue
                 else:
                     if ";" in rawdata[i:]:  # bail by consuming &#
                         self.handle_data(rawdata[i:i+2])
                         i = self.updatepos(i, i+2)
                     # Processing stops here in both cases.
                     break
             elif startswith('&', i):
                 match = entityref.match(rawdata, i)
                 if match:
                     name = match.group(1)
                     self.handle_entityref(name)
                     k = match.end()
                     # Same terminator handling as for character references.
                     if not startswith(';', k-1):
                         k = k - 1
                     i = self.updatepos(i, k)
                     continue
                 match = incomplete.match(rawdata, i)
                 if match:
                     # match.group() will contain at least 2 chars
                     if end and match.group() == rawdata[i:]:
                         if self.strict:
                             self.error("EOF in middle of entity or char ref")
                         else:
                             # NOTE(review): k is computed here but not used
                             # afterwards; only the '&' is skipped.
                             k = match.end()
                             if k <= i:
                                 k = n
                             i = self.updatepos(i, i + 1)
                     # incomplete
                     break
                 elif (i + 1) < n:
                     # not the end of the buffer, and can't be confused
                     # with some other construct
                     self.handle_data("&")
                     i = self.updatepos(i, i + 1)
                 else:
                     break
             else:
                 assert 0, "interesting.search() lied"
         # end while
         # At EOF, outside CDATA mode, whatever is left is reported as data.
         if end and i < n and not self.cdata_elem:
             if self.convert_charrefs and not self.cdata_elem:
                 self.handle_data(unescape(rawdata[i:n]))
             else:
                 self.handle_data(rawdata[i:n])
             i = self.updatepos(i, n)
         # Keep the unprocessed tail for the next call.
         self.rawdata = rawdata[i:]

     # Internal -- parse html declarations, return end index or -1 if not terminated
     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
     # See also parse_declaration in _markupbase
     def parse_html_declaration(self, i):
         """Handle the '<!...' construct that starts at index i.

         Comments and marked sections ('<![') are delegated to
         parse_comment() and parse_marked_section(); a doctype is reported
         through handle_decl() with the text between '<!' and '>'; anything
         else goes to parse_bogus_comment().  For a doctype the result is
         the index just past '>', or -1 if no '>' is buffered yet;
         otherwise the delegate's result is returned unchanged.
         """
         rawdata = self.rawdata
         assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                         'parse_html_declaration()')
         if rawdata[i:i+4] == '<!--':
             # goahead() normally catches comments before calling us.
             return self.parse_comment(i)
         if rawdata[i:i+3] == '<![':
             return self.parse_marked_section(i)
         if rawdata[i:i+9].lower() != '<!doctype':
             return self.parse_bogus_comment(i)
         # doctype: everything up to the first '>' is the declaration.
         close = rawdata.find('>', i+9)
         if close == -1:
             return -1
         self.handle_decl(rawdata[i+2:close])
         return close + 1

     # Internal -- parse bogus comment, return end index or -1 if not terminated
     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
     def parse_bogus_comment(self, i, report=1):
         """Treat the '<!...>' or '</...>' construct at index i as a comment.

         The text between the two-character opener and the first following
         '>' is passed to handle_comment() unless *report* is false.

         Returns the index just past the '>', or -1 if no '>' is buffered.
         """
         rawdata = self.rawdata
         assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                 'parse_bogus_comment()')
         pos = rawdata.find('>', i+2)
         if pos == -1:
             return -1
         if report:
             self.handle_comment(rawdata[i+2:pos])
         return pos + 1

     # Internal -- parse processing instr, return end or -1 if not terminated
     def parse_pi(self, i):
         """Handle the processing instruction ('<?...') starting at index i.

         The text between '<?' and the first following '>' is passed to
         handle_pi().  Returns the index just past that '>', or -1 if it
         has not been buffered yet.
         """
         rawdata = self.rawdata
         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
         closing = piclose.search(rawdata, i+2) # >
         if not closing:
             return -1
         self.handle_pi(rawdata[i+2: closing.start()])
         return closing.end()

     # Internal -- handle starttag, return end or -1 if not terminated
     def parse_starttag(self, i):
         """Parse the start tag beginning at index i and call its handler.

         Returns the index just past the tag, or a negative value (from
         check_for_whole_start_tag()) when the tag is not complete yet.

         The tag name and attribute names are lower-cased.  Attribute
         values have one pair of matching surrounding quotes removed and
         are passed through unescape(); an attribute without '=value' gets
         the value None.  A tag ending in '/>' is reported through
         handle_startendtag(), any other through handle_starttag(); for
         the latter, tags listed in CDATA_CONTENT_ELEMENTS switch the
         parser into CDATA mode.  If text other than '>' or '/>' is left
         after the attributes, the whole tag is reported as data (in
         strict mode self.error() is called first).
         """
         self.__starttag_text = None
         endpos = self.check_for_whole_start_tag(i)
         if endpos < 0:
             return endpos
         rawdata = self.rawdata
         self.__starttag_text = rawdata[i:endpos]

         # Now parse the data between i+1 and endpos into a tag and attrs
         if self.strict:
             tag_re, attr_re = tagfind, attrfind
         else:
             tag_re, attr_re = tagfind_tolerant, attrfind_tolerant
         match = tag_re.match(rawdata, i+1)
         assert match, 'unexpected call to parse_starttag()'
         k = match.end()
         self.lasttag = tag = match.group(1).lower()

         attrs = []
         while k < endpos:
             m = attr_re.match(rawdata, k)
             if not m:
                 break
             attrname, rest, attrvalue = m.group(1, 2, 3)
             if not rest:
                 # No '=value' part at all.
                 attrvalue = None
             elif attrvalue[:1] in ('\'', '"') and \
                  attrvalue[:1] == attrvalue[-1:]:
                 # Strip one pair of matching quotes.
                 attrvalue = attrvalue[1:-1]
             if attrvalue:
                 attrvalue = unescape(attrvalue)
             attrs.append((attrname.lower(), attrvalue))
             k = m.end()

         end = rawdata[k:endpos].strip()
         if end not in (">", "/>"):
             # Leftover junk before the end of the tag.
             if self.strict:
                 self.error("junk characters in start tag: %r"
                            % (rawdata[k:endpos][:20],))
             self.handle_data(rawdata[i:endpos])
             return endpos
         if end.endswith('/>'):
             # XHTML-style empty tag: <span attr="value" />
             self.handle_startendtag(tag, attrs)
         else:
             self.handle_starttag(tag, attrs)
             if tag in self.CDATA_CONTENT_ELEMENTS:
                 self.set_cdata_mode(tag)
         return endpos

     # Internal -- check to see if we have a complete starttag; return end
     # or -1 if incomplete.
     def check_for_whole_start_tag(self, i):
         """Return the end index of the start tag at i, or -1 if incomplete.

         The tag is located with locatestarttagend (strict mode) or
         locatestarttagend_tolerant, and the character following the match
         decides the outcome:

         * '>' or '/>' completes the tag; the index just past it is
           returned.
         * A '/' not followed by '>', the end of the buffer, an ASCII
           letter or '=' all mean more input is needed: -1 is returned.
         * Anything else is a malformed tag.  Strict mode calls
           self.error(); otherwise the index where the match stopped is
           returned.

         Raises AssertionError if rawdata[i:] does not start a tag at all.
         """
         rawdata = self.rawdata
         if self.strict:
             m = locatestarttagend.match(rawdata, i)
         else:
             m = locatestarttagend_tolerant.match(rawdata, i)
         if not m:
             raise AssertionError("we should not get here!")

         j = m.end()
         following = rawdata[j:j+1]
         if following == ">":
             return j + 1
         if following == "/":
             if rawdata.startswith("/>", j):
                 return j + 2
             # buffer boundary: a '/' without its '>' is always reported
             # as incomplete.
             return -1
         if following == "":
             # end of input
             return -1
         if following in ("abcdefghijklmnopqrstuvwxyz=/"
                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
             # end of input in or before attribute value, or we have the
             # '/' from a '/>' ending
             return -1
         if self.strict:
             self.updatepos(i, j)
             self.error("malformed start tag")
         if j > i:
             return j
         else:
             return i + 1

     # Internal -- parse endtag, return end or -1 if incomplete
     def parse_endtag(self, i):
         """Parse the '</...' construct at index i.

         Returns the index just past the construct, or -1 if no '>' has
         been buffered yet.

         A well-formed end tag is reported through handle_endtag() with
         the lower-cased name and ends CDATA mode.  In CDATA mode, any end
         tag other than the one closing the current element is reported as
         data.  Outside CDATA mode, malformed end tags are recovered from:
         '</>' is dropped, a tag without a usable name becomes a bogus
         comment, and otherwise everything between the name and the next
         '>' is ignored.  In strict mode self.error() is called for a
         malformed end tag.
         """
         rawdata = self.rawdata
         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
         closing = endendtag.search(rawdata, i+1) # >
         if not closing:
             return -1
         gtpos = closing.end()

         wellformed = endtagfind.match(rawdata, i) # </ + tag + >
         if wellformed:
             elem = wellformed.group(1).lower()
             if self.cdata_elem is not None and elem != self.cdata_elem:
                 # Some other element's end tag inside CDATA content.
                 self.handle_data(rawdata[i:gtpos])
                 return gtpos
             self.handle_endtag(elem)
             self.clear_cdata_mode()
             return gtpos

         # Malformed end tag.
         if self.cdata_elem is not None:
             self.handle_data(rawdata[i:gtpos])
             return gtpos
         if self.strict:
             self.error("bad end tag: %r" % (rawdata[i:gtpos],))
         # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
         namematch = tagfind_tolerant.match(rawdata, i+2)
         if not namematch:
             # w3.org/TR/html5/tokenization.html#end-tag-open-state
             if rawdata[i:i+3] == '</>':
                 return i+3
             return self.parse_bogus_comment(i)
         tagname = namematch.group(1).lower()
         # Skip whatever sits between the name and the next '>'.  This is
         # not fully correct for input such as </tag attr=">">, but it
         # covers most cases and is much simpler.
         gtpos = rawdata.find('>', namematch.end())
         self.handle_endtag(tagname)
         return gtpos+1

     # Overridable -- finish processing of start+end tag: <tag.../>
     def handle_startendtag(self, tag, attrs):
         """Handle an XHTML-style empty tag such as <tag ... />.

         Called by parse_starttag() when the tag ends in '/>'.  The default
         implementation calls handle_starttag(tag, attrs) followed by
         handle_endtag(tag).
         """
         self.handle_starttag(tag, attrs)
         self.handle_endtag(tag)

     # Overridable -- handle start tag
     def handle_starttag(self, tag, attrs):
         """Handle a start tag; the default implementation does nothing.

         parse_starttag() passes the lower-cased tag name and a list of
         (lower-cased attribute name, value) tuples, where value is None
         for an attribute without an '=value' part.
         """
         pass

     # Overridable -- handle end tag
     def handle_endtag(self, tag):
         """Handle an end tag; the default implementation does nothing.

         parse_endtag() passes the lower-cased tag name.
         """
         pass

     # Overridable -- handle character reference
     def handle_charref(self, name):
         """Handle a numeric character reference; the default does nothing.

         goahead() passes the text between '&#' and the terminating
         character, e.g. 'x41' for '&#x41;'.
         """
         pass

     # Overridable -- handle entity reference
     def handle_entityref(self, name):
         """Handle a named entity reference; the default does nothing.

         goahead() passes the entity name without the leading '&' and the
         terminating character, e.g. 'amp' for '&amp;'.
         """
         pass

     # Overridable -- handle data
     def handle_data(self, data):
         """Handle a chunk of text data; the default does nothing."""
         pass

     # Overridable -- handle comment
     def handle_comment(self, data):
         """Handle a comment; the default implementation does nothing.

         parse_bogus_comment() passes the text between the two-character
         opener ('<!' or '</') and the closing '>'.  Presumably the
         inherited parse_comment() calls this too (not visible here).
         """
         pass

     # Overridable -- handle declaration
     def handle_decl(self, decl):
         """Handle a declaration; the default implementation does nothing.

         parse_html_declaration() passes the doctype text between '<!' and
         '>', e.g. 'DOCTYPE html' for '<!DOCTYPE html>'.
         """
         pass

     # Overridable -- handle processing instruction
     def handle_pi(self, data):
         """Handle a processing instruction; the default does nothing.

         parse_pi() passes the text between '<?' and the first '>'.
         """
         pass

     def unknown_decl(self, data):
         """Report an unknown declaration.

         In strict mode this calls self.error(), which raises
         HTMLParseError; otherwise the declaration is silently ignored.
         """
         if self.strict:
             self.error("unknown declaration: %r" % (data,))

     # Internal -- helper to remove special character quoting
     def unescape(self, s):
         """Deprecated wrapper around html.unescape().

         Emits a DeprecationWarning and returns html.unescape(s).
         """
         warnings.warn('The unescape method is deprecated and will be removed '
                       'in 3.5, use html.unescape() instead.',
                       DeprecationWarning, stacklevel=2)
         return unescape(s)
	"""A parser for HTML and XHTML."""

	# This file is based on sgmllib.py, but the API is slightly different.

	# XXX There should be a way to distinguish between PCDATA (parsed
	# character data -- the normal case), RCDATA (replaceable character
	# data -- only char and entity references and end tags are special)
	# and CDATA (character data -- only end tags are special).


	import re
	import warnings
	import _markupbase

	from html import unescape


	__all__ = ['HTMLParser']

	# Regular expressions used for parsing

	interesting_normal = re.compile('[&<]')
	incomplete = re.compile('&[a-zA-Z#]')

	entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
	charref = re.compile('&#(?:[0-9]+\|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

	starttagopen = re.compile('<[a-zA-Z]')
	piclose = re.compile('>')
	commentclose = re.compile(r'--\s*>')
	# Note:
	# 1) the strict attrfind isn't really strict, but we can't make it
	# correctly strict without breaking backward compatibility;
	# 2) if you change tagfind/attrfind remember to update locatestarttagend too;
	# 3) if you change tagfind/attrfind and/or locatestarttagend the parser will
	# explode, so don't do it.
	tagfind = re.compile('([a-zA-Z][-.a-zA-Z0-9:_])(?:\s\|/(?!>))')
	# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
	# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
	tagfind_tolerant = re.compile('([a-zA-Z][^\t\n\r\f />\x00])(?:\s\|/(?!>))')
	attrfind = re.compile(
	r'\s([a-zA-Z_][-.:a-zA-Z_0-9])(\s=\s'
	r'(\'[^\']\'\|"[^"]"\|[^\s"\'=<>`]*))?')
	attrfind_tolerant = re.compile(
	r'((?<=[\'"\s/])[^\s/>][^\s/=>])(\s=+\s*'
	r'(\'[^\']\'\|"[^"]"\|(?![\'"])[^>\s]))?(?:\s\|/(?!>))')
	locatestarttagend = re.compile(r"""
	<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
	(?:\s+ # whitespace before attribute name
	(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
	(?:\s=\s # value indicator
	(?:'[^']*' # LITA-enclosed value
	\|\"[^\"]*\" # LIT-enclosed value
	\|[^'\">\s]+ # bare value
	)
	)?
	)
	)*
	\s* # trailing whitespace
	""", re.VERBOSE)
	locatestarttagend_tolerant = re.compile(r"""
	<[a-zA-Z][^\t\n\r\f />\x00]* # tag name
	(?:[\s/]* # optional whitespace before attribute name
	(?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name
	(?:\s=+\s # value indicator
	(?:'[^']*' # LITA-enclosed value
	\|"[^"]*" # LIT-enclosed value
	\|(?!['"])[^>\s]* # bare value
	)
	(?:\s,) # possibly followed by a comma
	)?(?:\s\|/(?!>))*
	)*
	)?
	\s* # trailing whitespace
	""", re.VERBOSE)
	endendtag = re.compile('>')
	# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
	# </ and the tag name, so maybe this should be fixed
	endtagfind = re.compile('</\s([a-zA-Z][-.a-zA-Z0-9:_])\s*>')


	class HTMLParseError(Exception):
	"""Exception raised for all parse errors."""

	def __init__(self, msg, position=(None, None)):
	assert msg
	self.msg = msg
	self.lineno = position[0]
	self.offset = position[1]

	def __str__(self):
	result = self.msg
	if self.lineno is not None:
	result = result + ", at line %d" % self.lineno
	if self.offset is not None:
	result = result + ", column %d" % (self.offset + 1)
	return result


	_default_sentinel = object()

	class HTMLParser(_markupbase.ParserBase):
	"""Find tags and other markup and call handler functions.

	Usage:
	p = HTMLParser()
	p.feed(data)
	...
	p.close()

	Start tags are handled by calling self.handle_starttag() or
	self.handle_startendtag(); end tags by self.handle_endtag(). The
	data between tags is passed from the parser to the derived class
	by calling self.handle_data() with the data as argument (the data
	may be split up in arbitrary chunks). If convert_charrefs is
	True the character references are converted automatically to the
	corresponding Unicode character (and self.handle_data() is no
	longer split in chunks), otherwise they are passed by calling
	self.handle_entityref() or self.handle_charref() with the string
	containing respectively the named or numeric reference as the
	argument.
	"""

	CDATA_CONTENT_ELEMENTS = ("script", "style")

	def __init__(self, strict=_default_sentinel, *,
	convert_charrefs=_default_sentinel):
	"""Initialize and reset this instance.

	If convert_charrefs is True (default: False), all character references
	are automatically converted to the corresponding Unicode characters.
	If strict is set to False (the default) the parser will parse invalid
	markup, otherwise it will raise an error. Note that the strict mode
	and argument are deprecated.
	"""
	if strict is not _default_sentinel:
	warnings.warn("The strict argument and mode are deprecated.",
	DeprecationWarning, stacklevel=2)
	else:
	strict = False # default
	self.strict = strict
	if convert_charrefs is _default_sentinel:
	convert_charrefs = False # default
	warnings.warn("The value of convert_charrefs will become True in "
	"3.5. You are encouraged to set the value explicitly.",
	DeprecationWarning, stacklevel=2)
	self.convert_charrefs = convert_charrefs
	self.reset()

	def reset(self):
	"""Reset this instance. Loses all unprocessed data."""
	self.rawdata = ''
	self.lasttag = '???'
	self.interesting = interesting_normal
	self.cdata_elem = None
	_markupbase.ParserBase.reset(self)

	def feed(self, data):
	r"""Feed data to the parser.

	Call this as often as you want, with as little or as much text
	as you want (may include '\n').
	"""
	self.rawdata = self.rawdata + data
	self.goahead(0)

	def close(self):
	"""Handle any buffered data."""
	self.goahead(1)

	def error(self, message):
	warnings.warn("The 'error' method is deprecated.",
	DeprecationWarning, stacklevel=2)
	raise HTMLParseError(message, self.getpos())

	__starttag_text = None

	def get_starttag_text(self):
	"""Return full source of start tag: '<...>'."""
	return self.__starttag_text

	def set_cdata_mode(self, elem):
	self.cdata_elem = elem.lower()
	self.interesting = re.compile(r'</\s%s\s>' % self.cdata_elem, re.I)

	def clear_cdata_mode(self):
	self.interesting = interesting_normal
	self.cdata_elem = None

	# Internal -- handle data as far as reasonable. May leave state
	# and data to be processed by a subsequent call. If 'end' is
	# true, force handling all data as if followed by EOF marker.
	def goahead(self, end):
	rawdata = self.rawdata
	i = 0
	n = len(rawdata)
	while i < n:
	if self.convert_charrefs and not self.cdata_elem:
	j = rawdata.find('<', i)
	if j < 0:
	if not end:
	break # wait till we get all the text
	j = n
	else:
	match = self.interesting.search(rawdata, i) # < or &
	if match:
	j = match.start()
	else:
	if self.cdata_elem:
	break
	j = n
	if i < j:
	if self.convert_charrefs and not self.cdata_elem:
	self.handle_data(unescape(rawdata[i:j]))
	else:
	self.handle_data(rawdata[i:j])
	i = self.updatepos(i, j)
	if i == n: break
	startswith = rawdata.startswith
	if startswith('<', i):
	if starttagopen.match(rawdata, i): # < + letter
	k = self.parse_starttag(i)
	elif startswith("</", i):
	k = self.parse_endtag(i)
	elif startswith("<!--", i):
	k = self.parse_comment(i)
	elif startswith("<?", i):
	k = self.parse_pi(i)
	elif startswith("<!", i):
	if self.strict:
	k = self.parse_declaration(i)
	else:
	k = self.parse_html_declaration(i)
	elif (i + 1) < n:
	self.handle_data("<")
	k = i + 1
	else:
	break
	if k < 0:
	if not end:
	break
	if self.strict:
	self.error("EOF in middle of construct")
	k = rawdata.find('>', i + 1)
	if k < 0:
	k = rawdata.find('<', i + 1)
	if k < 0:
	k = i + 1
	else:
	k += 1
	if self.convert_charrefs and not self.cdata_elem:
	self.handle_data(unescape(rawdata[i:k]))
	else:
	self.handle_data(rawdata[i:k])
	i = self.updatepos(i, k)
	elif startswith("&#", i):
	match = charref.match(rawdata, i)
	if match:
	name = match.group()[2:-1]
	self.handle_charref(name)
	k = match.end()
	if not startswith(';', k-1):
	k = k - 1
	i = self.updatepos(i, k)
	continue
	else:
	if ";" in rawdata[i:]: # bail by consuming &#
	self.handle_data(rawdata[i:i+2])
	i = self.updatepos(i, i+2)
	break
	elif startswith('&', i):
	match = entityref.match(rawdata, i)
	if match:
	name = match.group(1)
	self.handle_entityref(name)
	k = match.end()
	if not startswith(';', k-1):
	k = k - 1
	i = self.updatepos(i, k)
	continue
	match = incomplete.match(rawdata, i)
	if match:
	# match.group() will contain at least 2 chars
	if end and match.group() == rawdata[i:]:
	if self.strict:
	self.error("EOF in middle of entity or char ref")
	else:
	k = match.end()
	if k <= i:
	k = n
	i = self.updatepos(i, i + 1)
	# incomplete
	break
	elif (i + 1) < n:
	# not the end of the buffer, and can't be confused
	# with some other construct
	self.handle_data("&")
	i = self.updatepos(i, i + 1)
	else:
	break
	else:
	assert 0, "interesting.search() lied"
	# end while
	if end and i < n and not self.cdata_elem:
	if self.convert_charrefs and not self.cdata_elem:
	self.handle_data(unescape(rawdata[i:n]))
	else:
	self.handle_data(rawdata[i:n])
	i = self.updatepos(i, n)
	self.rawdata = rawdata[i:]

	# Internal -- parse html declarations, return length or -1 if not terminated
	# See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
	# See also parse_declaration in _markupbase
	def parse_html_declaration(self, i):
	rawdata = self.rawdata
	assert rawdata[i:i+2] == '<!', ('unexpected call to '
	'parse_html_declaration()')
	if rawdata[i:i+4] == '<!--':
	# this case is actually already handled in goahead()
	return self.parse_comment(i)
	elif rawdata[i:i+3] == '<![':
	return self.parse_marked_section(i)
	elif rawdata[i:i+9].lower() == '<!doctype':
	# find the closing >
	gtpos = rawdata.find('>', i+9)
	if gtpos == -1:
	return -1
	self.handle_decl(rawdata[i+2:gtpos])
	return gtpos+1
	else:
	return self.parse_bogus_comment(i)

	# Internal -- parse bogus comment, return length or -1 if not terminated
	# see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
	def parse_bogus_comment(self, i, report=1):
	rawdata = self.rawdata
	assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
	'parse_comment()')
	pos = rawdata.find('>', i+2)
	if pos == -1:
	return -1
	if report:
	self.handle_comment(rawdata[i+2:pos])
	return pos + 1

	# Internal -- parse processing instr, return end or -1 if not terminated
	def parse_pi(self, i):
	rawdata = self.rawdata
	assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
	match = piclose.search(rawdata, i+2) # >
	if not match:
	return -1
	j = match.start()
	self.handle_pi(rawdata[i+2: j])
	j = match.end()
	return j

	# Internal -- handle starttag, return end or -1 if not terminated
	def parse_starttag(self, i):
	    """Parse the start tag beginning at index *i* of rawdata.

	    check_for_whole_start_tag() decides where the tag ends; a negative
	    result from it is returned unchanged.  Otherwise the tag name and
	    attributes are extracted and reported through handle_starttag(), or
	    handle_startendtag() when the tag closes with ``/>``.  The tag name
	    and attribute names are lower-cased; an attribute without a value
	    gets ``None``, and non-empty values are unquoted and unescaped.

	    If unparsed text remains before the closing ``>`` the whole tag is
	    passed to handle_data() instead (after self.error() in strict mode).

	    Return the index just past the tag.
	    """
	    self.__starttag_text = None
	    endpos = self.check_for_whole_start_tag(i)
	    if endpos < 0:
	        return endpos
	    rawdata = self.rawdata
	    self.__starttag_text = rawdata[i:endpos]

	    # Strict and tolerant modes use different tag/attribute patterns;
	    # pick them once instead of on every attribute.
	    if self.strict:
	        tag_re, attr_re = tagfind, attrfind
	    else:
	        tag_re, attr_re = tagfind_tolerant, attrfind_tolerant

	    # Parse the text between i+1 and endpos into a tag name and attrs.
	    match = tag_re.match(rawdata, i+1)
	    assert match, 'unexpected call to parse_starttag()'
	    k = match.end()
	    self.lasttag = tag = match.group(1).lower()

	    attrs = []
	    while k < endpos:
	        m = attr_re.match(rawdata, k)
	        if not m:
	            break
	        attrname, rest, attrvalue = m.group(1, 2, 3)
	        if not rest:
	            # Bare attribute such as <input checked>.
	            attrvalue = None
	        elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
	             attrvalue[:1] == '"' == attrvalue[-1:]:
	            attrvalue = attrvalue[1:-1]
	        if attrvalue:
	            attrvalue = unescape(attrvalue)
	        attrs.append((attrname.lower(), attrvalue))
	        k = m.end()

	    end = rawdata[k:endpos].strip()
	    if end not in (">", "/>"):
	        # Junk before the closing bracket: report the tag as plain data.
	        # (The line/offset bookkeeping that used to be computed here was
	        # never used, so it has been dropped.)
	        if self.strict:
	            self.error("junk characters in start tag: %r"
	                       % (rawdata[k:endpos][:20],))
	        self.handle_data(rawdata[i:endpos])
	        return endpos
	    if end.endswith('/>'):
	        # XHTML-style empty tag: <span attr="value" />
	        self.handle_startendtag(tag, attrs)
	    else:
	        self.handle_starttag(tag, attrs)
	        if tag in self.CDATA_CONTENT_ELEMENTS:
	            self.set_cdata_mode(tag)
	    return endpos

	# Internal -- check to see if we have a complete starttag; return end
	# or -1 if incomplete.
	def check_for_whole_start_tag(self, i):
	    """Locate the end of the start tag that begins at index *i*.

	    Return the index just past the tag when its closing ``>`` or ``/>``
	    is buffered, or -1 when more input is needed.  For malformed input
	    self.error() is called in strict mode; otherwise an index greater
	    than *i* is returned so the caller can keep making progress.
	    """
	    rawdata = self.rawdata
	    if self.strict:
	        m = locatestarttagend.match(rawdata, i)
	    else:
	        m = locatestarttagend_tolerant.match(rawdata, i)
	    if not m:
	        raise AssertionError("we should not get here!")

	    j = m.end()
	    following = rawdata[j:j+1]
	    if following == ">":
	        return j + 1
	    if following == "/":
	        if rawdata.startswith("/>", j):
	            return j + 2
	        if rawdata[j:j+2] == "/":
	            # Buffer boundary: the '/' is the last buffered character,
	            # so a '>' may still arrive.  (The previous test,
	            # startswith("/", j), was always true here, which made the
	            # bogus-input handling below unreachable.)
	            return -1
	        # else bogus input
	        if self.strict:
	            self.updatepos(i, j + 1)
	            self.error("malformed empty start tag")
	        if j > i:
	            return j
	        else:
	            return i + 1
	    if following == "":
	        # end of input
	        return -1
	    if following in ("abcdefghijklmnopqrstuvwxyz=/"
	                     "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
	        # end of input in or before attribute value, or we have the
	        # '/' from a '/>' ending
	        return -1
	    if self.strict:
	        self.updatepos(i, j)
	        self.error("malformed start tag")
	    if j > i:
	        return j
	    else:
	        return i + 1

	# Internal -- parse endtag, return end or -1 if incomplete
	def parse_endtag(self, i):
	    """Parse the end tag (``</...>``) starting at index *i* of rawdata.

	    Return the index just past the tag, or -1 if no ``>`` has been
	    buffered yet.  A well-formed end tag is reported via handle_endtag()
	    with the lower-cased name and CDATA mode is cleared.  While a CDATA
	    element is open, any end tag other than its own is passed to
	    handle_data() as text.  Malformed end tags outside CDATA mode are
	    recovered from (after self.error() in strict mode).
	    """
	    rawdata = self.rawdata
	    assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
	    closer = endendtag.search(rawdata, i+1)  # first '>'
	    if not closer:
	        return -1
	    gtpos = closer.end()

	    wellformed = endtagfind.match(rawdata, i)  # </ + tag + >
	    if wellformed:
	        elem = wellformed.group(1).lower()  # script or style
	        if self.cdata_elem is not None and elem != self.cdata_elem:
	            # Some other end tag inside CDATA content: plain text.
	            self.handle_data(rawdata[i:gtpos])
	            return gtpos
	        self.handle_endtag(elem)
	        self.clear_cdata_mode()
	        return gtpos

	    # Not a clean </name> form.
	    if self.cdata_elem is not None:
	        self.handle_data(rawdata[i:gtpos])
	        return gtpos
	    if self.strict:
	        self.error("bad end tag: %r" % (rawdata[i:gtpos],))
	    # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
	    namematch = tagfind_tolerant.match(rawdata, i+2)
	    if not namematch:
	        # w3.org/TR/html5/tokenization.html#end-tag-open-state
	        if rawdata[i:i+3] == '</>':
	            return i+3
	        return self.parse_bogus_comment(i)
	    tagname = namematch.group(1).lower()
	    # Consume and ignore whatever sits between the name and the '>'.
	    # This is not 100% correct, since we might have things like
	    # </tag attr=">">, but looking for '>' after the name covers most
	    # cases and is much simpler.
	    gtpos = rawdata.find('>', namematch.end())
	    self.handle_endtag(tagname)
	    return gtpos+1

	# Overridable -- finish processing of start+end tag: <tag.../>
	def handle_startendtag(self, tag, attrs):
	    """Handle an XHTML-style empty tag such as ``<br />``.

	    The default treats it as a start tag immediately followed by the
	    matching end tag.  Subclasses may override this.
	    """
	    self.handle_starttag(tag, attrs)
	    self.handle_endtag(tag)

	# Overridable -- handle start tag
	def handle_starttag(self, tag, attrs):
	    """Called for each start tag; does nothing unless overridden.

	    parse_starttag() passes the lower-cased tag name and a list of
	    ``(name, value)`` pairs, where *value* is ``None`` for an attribute
	    given without a value.
	    """

	# Overridable -- handle end tag
	def handle_endtag(self, tag):
	    """Called for each end tag; does nothing unless overridden.

	    parse_endtag() passes the lower-cased tag name.
	    """

	# Overridable -- handle character reference
	def handle_charref(self, name):
	    """Hook for character references; does nothing unless overridden.

	    The call sites are outside this part of the file, so the exact
	    form of *name* should be checked against the caller.
	    """

	# Overridable -- handle entity reference
	def handle_entityref(self, name):
	    """Hook for entity references; does nothing unless overridden.

	    The call sites are outside this part of the file, so the exact
	    form of *name* should be checked against the caller.
	    """

	# Overridable -- handle data
	def handle_data(self, data):
	    """Called with runs of text; does nothing unless overridden.

	    Besides ordinary character data, the parser also sends here the
	    raw text of start tags it could not parse and of end tags met
	    inside CDATA content.
	    """

	# Overridable -- handle comment
	def handle_comment(self, data):
	    """Called with the text of a comment; does nothing unless overridden.

	    parse_bogus_comment() passes the text between the two-character
	    opener (``<!`` or ``</``) and the closing ``>``.
	    """

	# Overridable -- handle declaration
	def handle_decl(self, decl):
	    """Called for a doctype declaration; does nothing unless overridden.

	    parse_html_declaration() passes the text between ``<!`` and the
	    closing ``>``, for example ``DOCTYPE html``.
	    """

	# Overridable -- handle processing instruction
	def handle_pi(self, data):
	    """Called for a processing instruction; does nothing unless overridden.

	    parse_pi() passes the text between ``<?`` and the closing ``>``.
	    """

	def unknown_decl(self, data):
	    """Report an unknown declaration via self.error() in strict mode.

	    Outside strict mode the declaration is silently ignored.
	    """
	    if not self.strict:
	        return
	    self.error("unknown declaration: %r" % (data,))

	# Internal -- helper to remove special character quoting
	def unescape(self, s):
	    """Deprecated helper: return *s* with character references replaced.

	    Emits a DeprecationWarning attributed to the caller, then delegates
	    to the module-level unescape() imported from ``html``.
	    """
	    message = ('The unescape method is deprecated and will be removed '
	               'in 3.5, use html.unescape() instead.')
	    warnings.warn(message, DeprecationWarning, stacklevel=2)
	    return unescape(s)