"""Shared support for scanning document type declarations in HTML and XHTML."""

import re
import string

# Match one declaration name token together with any whitespace after it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Match one single- or double-quoted string literal plus trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read ``self.rawdata`` (the input buffer) and call
    ``self.handle_decl()`` and ``self.parse_comment()``; none of those are
    defined in this class, so a concrete subclass is expected to supply
    them.  ``error()`` must be overridden and is relied upon to raise: the
    scanners do not try to recover after calling it.

    Scanner methods return the index just past what they consumed, or -1
    when the buffer ends before the construct is complete.
    """

    def __init__(self):
        """Refuse direct instantiation; only subclasses may be created."""
        if self.__class__ is ParserBase:
            raise RuntimeError(
                "markupbase.ParserBase must be subclassed")

    def error(self, message):
        """Report a parse error; subclasses must override this method."""
        raise NotImplementedError(
            "subclasses of ParserBase must override error()")

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: nlines > 0 guarantees a newline in [i, j).
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters accepted (and skipped) inside a declaration.
    # Cleared again whenever a doctype declaration is scanned.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Scan the declaration starting at rawdata[i:] (which is "<!").

        On success the text between "<!" and ">" is passed to
        handle_decl() for a doctype declaration or to unknown_decl() for
        anything else, and the index just past ">" is returned.  Returns
        -1 when the buffer ends before the declaration is complete.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %r char in declaration" % rawdata[j])
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at rawdata[i:].

        Returns the index of the ">" that follows the closing "]" (after
        optional whitespace), or -1 if the buffer ends first.
        declstartpos is only used to update the position before an error
        is reported.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %r)"
                               % s)
                if (j + 4) > n:
                    # Fewer than four characters left, so "<!--" cannot be
                    # told apart from a declaration yet; incomplete.
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %r in internal subset" % name)
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %r in internal subset" % c)
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT declaration whose name starts at rawdata[i:].

        Returns the index just past the closing ">" or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = self.rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST declaration whose name starts at rawdata[i:].

        Returns the index just past the closing ">" or -1 if incomplete.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while True:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # Bounds-check explicitly: an empty slice at the end of the
                # buffer must not be treated as whitespace, otherwise this
                # loop would never terminate.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION declaration whose name starts at rawdata[i:].

        Returns the index just past the closing ">" or -1 if incomplete.
        """
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while True:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY declaration whose body starts at rawdata[i:].

        A leading "%" (parameter entity) and the whitespace after it are
        skipped first.  Returns the index just past the closing ">" or -1
        if incomplete.
        """
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            j = i + 1
            while True:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while True:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1  # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token and the new position and the token, or
    # return -1 if we've reached the end of the buffer.
    def _scan_name(self, i, declstartpos):
        """Scan a name token at rawdata[i:].

        Returns (lowercased_name, index_past_name_and_whitespace).  Returns
        (None, -1) when the token (with its trailing whitespace) runs up to
        the end of the buffer, since more input could still extend it.
        If no name starts at i, the position is updated and error() is
        called; error() is expected to raise.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            self.error("expected name token")

    # To be overridden -- handlers for unknown objects
    def unknown_decl(self, data):
        """Handle a non-doctype declaration; the default ignores it."""
        pass