| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 1 | """A parser for HTML and XHTML.""" | 
 | 2 |  | 
 | 3 | # This file is based on sgmllib.py, but the API is slightly different. | 
 | 4 |  | 
 | 5 | # XXX There should be a way to distinguish between PCDATA (parsed | 
 | 6 | # character data -- the normal case), RCDATA (replaceable character | 
 | 7 | # data -- only char and entity references and end tags are special) | 
 | 8 | # and CDATA (character data -- only end tags are special). | 
 | 9 |  | 
 | 10 |  | 
 | 11 | import markupbase | 
 | 12 | import re | 
 | 13 |  | 
 | 14 | # Regular expressions used for parsing | 
 | 15 |  | 
 | 16 | interesting_normal = re.compile('[&<]') | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 17 | incomplete = re.compile('&[a-zA-Z#]') | 
 | 18 |  | 
 | 19 | entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') | 
 | 20 | charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') | 
 | 21 |  | 
 | 22 | starttagopen = re.compile('<[a-zA-Z]') | 
 | 23 | piclose = re.compile('>') | 
 | 24 | commentclose = re.compile(r'--\s*>') | 
 | 25 | tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') | 
| Ezio Melotti | f117443 | 2012-02-13 16:28:54 +0200 | [diff] [blame] | 26 | # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state | 
 | 27 | # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state | 
 | 28 | tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*') | 
| Ezio Melotti | 0f1571c | 2011-11-14 18:04:05 +0200 | [diff] [blame] | 29 |  | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 30 | attrfind = re.compile( | 
| Ezio Melotti | 0f1571c | 2011-11-14 18:04:05 +0200 | [diff] [blame] | 31 |     r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*' | 
 | 32 |     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?') | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 33 |  | 
 | 34 | locatestarttagend = re.compile(r""" | 
 | 35 |   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name | 
 | 36 |   (?:\s+                             # whitespace before attribute name | 
| Ezio Melotti | 0f1571c | 2011-11-14 18:04:05 +0200 | [diff] [blame] | 37 |     (?:(?<=['"\s])[^\s/>][^\s/=>]*   # attribute name | 
 | 38 |       (?:\s*=+\s*                    # value indicator | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 39 |         (?:'[^']*'                   # LITA-enclosed value | 
| Ezio Melotti | 0f1571c | 2011-11-14 18:04:05 +0200 | [diff] [blame] | 40 |           |"[^"]*"                   # LIT-enclosed value | 
 | 41 |           |(?!['"])[^>\s]*           # bare value | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 42 |          ) | 
| Ezio Melotti | 0f1571c | 2011-11-14 18:04:05 +0200 | [diff] [blame] | 43 |        )?\s* | 
 | 44 |      )* | 
 | 45 |    )? | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 46 |   \s*                                # trailing whitespace | 
 | 47 | """, re.VERBOSE) | 
 | 48 | endendtag = re.compile('>') | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 49 | # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between | 
 | 50 | # </ and the tag name, so maybe this should be fixed | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 51 | endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') | 
 | 52 |  | 
 | 53 |  | 
 | 54 | class HTMLParseError(Exception): | 
 | 55 |     """Exception raised for all parse errors.""" | 
 | 56 |  | 
 | 57 |     def __init__(self, msg, position=(None, None)): | 
 | 58 |         assert msg | 
 | 59 |         self.msg = msg | 
 | 60 |         self.lineno = position[0] | 
 | 61 |         self.offset = position[1] | 
 | 62 |  | 
 | 63 |     def __str__(self): | 
 | 64 |         result = self.msg | 
 | 65 |         if self.lineno is not None: | 
 | 66 |             result = result + ", at line %d" % self.lineno | 
 | 67 |         if self.offset is not None: | 
 | 68 |             result = result + ", column %d" % (self.offset + 1) | 
 | 69 |         return result | 
 | 70 |  | 
 | 71 |  | 
 | 72 | class HTMLParser(markupbase.ParserBase): | 
 | 73 |     """Find tags and other markup and call handler functions. | 
 | 74 |  | 
 | 75 |     Usage: | 
 | 76 |         p = HTMLParser() | 
 | 77 |         p.feed(data) | 
 | 78 |         ... | 
 | 79 |         p.close() | 
 | 80 |  | 
 | 81 |     Start tags are handled by calling self.handle_starttag() or | 
 | 82 |     self.handle_startendtag(); end tags by self.handle_endtag().  The | 
 | 83 |     data between tags is passed from the parser to the derived class | 
 | 84 |     by calling self.handle_data() with the data as argument (the data | 
 | 85 |     may be split up in arbitrary chunks).  Entity references are | 
 | 86 |     passed by calling self.handle_entityref() with the entity | 
 | 87 |     reference as the argument.  Numeric character references are | 
 | 88 |     passed to self.handle_charref() with the string containing the | 
 | 89 |     reference as the argument. | 
 | 90 |     """ | 
 | 91 |  | 
 | 92 |     CDATA_CONTENT_ELEMENTS = ("script", "style") | 
 | 93 |  | 
 | 94 |  | 
 | 95 |     def __init__(self): | 
 | 96 |         """Initialize and reset this instance.""" | 
 | 97 |         self.reset() | 
 | 98 |  | 
 | 99 |     def reset(self): | 
 | 100 |         """Reset this instance.  Loses all unprocessed data.""" | 
 | 101 |         self.rawdata = '' | 
 | 102 |         self.lasttag = '???' | 
 | 103 |         self.interesting = interesting_normal | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 104 |         self.cdata_elem = None | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 105 |         markupbase.ParserBase.reset(self) | 
 | 106 |  | 
 | 107 |     def feed(self, data): | 
| Éric Araujo | 31890bc | 2011-05-25 18:11:43 +0200 | [diff] [blame] | 108 |         r"""Feed data to the parser. | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 109 |  | 
 | 110 |         Call this as often as you want, with as little or as much text | 
 | 111 |         as you want (may include '\n'). | 
 | 112 |         """ | 
 | 113 |         self.rawdata = self.rawdata + data | 
 | 114 |         self.goahead(0) | 
 | 115 |  | 
 | 116 |     def close(self): | 
 | 117 |         """Handle any buffered data.""" | 
 | 118 |         self.goahead(1) | 
 | 119 |  | 
 | 120 |     def error(self, message): | 
 | 121 |         raise HTMLParseError(message, self.getpos()) | 
 | 122 |  | 
 | 123 |     __starttag_text = None | 
 | 124 |  | 
 | 125 |     def get_starttag_text(self): | 
 | 126 |         """Return full source of start tag: '<...>'.""" | 
 | 127 |         return self.__starttag_text | 
 | 128 |  | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 129 |     def set_cdata_mode(self, elem): | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 130 |         self.cdata_elem = elem.lower() | 
| Ezio Melotti | 00dc60b | 2011-11-18 18:00:40 +0200 | [diff] [blame] | 131 |         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I) | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 132 |  | 
 | 133 |     def clear_cdata_mode(self): | 
 | 134 |         self.interesting = interesting_normal | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 135 |         self.cdata_elem = None | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 136 |  | 
 | 137 |     # Internal -- handle data as far as reasonable.  May leave state | 
 | 138 |     # and data to be processed by a subsequent call.  If 'end' is | 
 | 139 |     # true, force handling all data as if followed by EOF marker. | 
 | 140 |     def goahead(self, end): | 
 | 141 |         rawdata = self.rawdata | 
 | 142 |         i = 0 | 
 | 143 |         n = len(rawdata) | 
 | 144 |         while i < n: | 
 | 145 |             match = self.interesting.search(rawdata, i) # < or & | 
 | 146 |             if match: | 
 | 147 |                 j = match.start() | 
 | 148 |             else: | 
| Ezio Melotti | 00dc60b | 2011-11-18 18:00:40 +0200 | [diff] [blame] | 149 |                 if self.cdata_elem: | 
 | 150 |                     break | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 151 |                 j = n | 
 | 152 |             if i < j: self.handle_data(rawdata[i:j]) | 
 | 153 |             i = self.updatepos(i, j) | 
 | 154 |             if i == n: break | 
 | 155 |             startswith = rawdata.startswith | 
 | 156 |             if startswith('<', i): | 
 | 157 |                 if starttagopen.match(rawdata, i): # < + letter | 
 | 158 |                     k = self.parse_starttag(i) | 
 | 159 |                 elif startswith("</", i): | 
 | 160 |                     k = self.parse_endtag(i) | 
 | 161 |                 elif startswith("<!--", i): | 
 | 162 |                     k = self.parse_comment(i) | 
 | 163 |                 elif startswith("<?", i): | 
 | 164 |                     k = self.parse_pi(i) | 
 | 165 |                 elif startswith("<!", i): | 
| Ezio Melotti | 4b92cc3 | 2012-02-13 16:10:44 +0200 | [diff] [blame] | 166 |                     k = self.parse_html_declaration(i) | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 167 |                 elif (i + 1) < n: | 
 | 168 |                     self.handle_data("<") | 
 | 169 |                     k = i + 1 | 
 | 170 |                 else: | 
 | 171 |                     break | 
 | 172 |                 if k < 0: | 
| Ezio Melotti | d2307cb | 2012-02-15 12:44:23 +0200 | [diff] [blame^] | 173 |                     if not end: | 
 | 174 |                         break | 
 | 175 |                     k = rawdata.find('>', i + 1) | 
 | 176 |                     if k < 0: | 
 | 177 |                         k = rawdata.find('<', i + 1) | 
 | 178 |                         if k < 0: | 
 | 179 |                             k = i + 1 | 
 | 180 |                     else: | 
 | 181 |                         k += 1 | 
 | 182 |                     self.handle_data(rawdata[i:k]) | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 183 |                 i = self.updatepos(i, k) | 
 | 184 |             elif startswith("&#", i): | 
 | 185 |                 match = charref.match(rawdata, i) | 
 | 186 |                 if match: | 
 | 187 |                     name = match.group()[2:-1] | 
 | 188 |                     self.handle_charref(name) | 
 | 189 |                     k = match.end() | 
 | 190 |                     if not startswith(';', k-1): | 
 | 191 |                         k = k - 1 | 
 | 192 |                     i = self.updatepos(i, k) | 
 | 193 |                     continue | 
 | 194 |                 else: | 
| Victor Stinner | 554a3b8 | 2010-05-24 21:33:24 +0000 | [diff] [blame] | 195 |                     if ";" in rawdata[i:]: #bail by consuming &# | 
 | 196 |                         self.handle_data(rawdata[0:2]) | 
 | 197 |                         i = self.updatepos(i, 2) | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 198 |                     break | 
 | 199 |             elif startswith('&', i): | 
 | 200 |                 match = entityref.match(rawdata, i) | 
 | 201 |                 if match: | 
 | 202 |                     name = match.group(1) | 
 | 203 |                     self.handle_entityref(name) | 
 | 204 |                     k = match.end() | 
 | 205 |                     if not startswith(';', k-1): | 
 | 206 |                         k = k - 1 | 
 | 207 |                     i = self.updatepos(i, k) | 
 | 208 |                     continue | 
 | 209 |                 match = incomplete.match(rawdata, i) | 
 | 210 |                 if match: | 
 | 211 |                     # match.group() will contain at least 2 chars | 
 | 212 |                     if end and match.group() == rawdata[i:]: | 
 | 213 |                         self.error("EOF in middle of entity or char ref") | 
 | 214 |                     # incomplete | 
 | 215 |                     break | 
 | 216 |                 elif (i + 1) < n: | 
 | 217 |                     # not the end of the buffer, and can't be confused | 
 | 218 |                     # with some other construct | 
 | 219 |                     self.handle_data("&") | 
 | 220 |                     i = self.updatepos(i, i + 1) | 
 | 221 |                 else: | 
 | 222 |                     break | 
 | 223 |             else: | 
 | 224 |                 assert 0, "interesting.search() lied" | 
 | 225 |         # end while | 
| Ezio Melotti | 00dc60b | 2011-11-18 18:00:40 +0200 | [diff] [blame] | 226 |         if end and i < n and not self.cdata_elem: | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 227 |             self.handle_data(rawdata[i:n]) | 
 | 228 |             i = self.updatepos(i, n) | 
 | 229 |         self.rawdata = rawdata[i:] | 
 | 230 |  | 
| Ezio Melotti | 4b92cc3 | 2012-02-13 16:10:44 +0200 | [diff] [blame] | 231 |     # Internal -- parse html declarations, return length or -1 if not terminated | 
 | 232 |     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state | 
 | 233 |     # See also parse_declaration in _markupbase | 
 | 234 |     def parse_html_declaration(self, i): | 
 | 235 |         rawdata = self.rawdata | 
 | 236 |         if rawdata[i:i+2] != '<!': | 
 | 237 |             self.error('unexpected call to parse_html_declaration()') | 
 | 238 |         if rawdata[i:i+4] == '<!--': | 
| Ezio Melotti | 369cbd7 | 2012-02-13 20:36:55 +0200 | [diff] [blame] | 239 |             # this case is actually already handled in goahead() | 
| Ezio Melotti | 4b92cc3 | 2012-02-13 16:10:44 +0200 | [diff] [blame] | 240 |             return self.parse_comment(i) | 
 | 241 |         elif rawdata[i:i+3] == '<![': | 
 | 242 |             return self.parse_marked_section(i) | 
 | 243 |         elif rawdata[i:i+9].lower() == '<!doctype': | 
 | 244 |             # find the closing > | 
| Ezio Melotti | 369cbd7 | 2012-02-13 20:36:55 +0200 | [diff] [blame] | 245 |             gtpos = rawdata.find('>', i+9) | 
| Ezio Melotti | 4b92cc3 | 2012-02-13 16:10:44 +0200 | [diff] [blame] | 246 |             if gtpos == -1: | 
 | 247 |                 return -1 | 
 | 248 |             self.handle_decl(rawdata[i+2:gtpos]) | 
 | 249 |             return gtpos+1 | 
 | 250 |         else: | 
 | 251 |             return self.parse_bogus_comment(i) | 
 | 252 |  | 
 | 253 |     # Internal -- parse bogus comment, return length or -1 if not terminated | 
 | 254 |     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state | 
 | 255 |     def parse_bogus_comment(self, i, report=1): | 
 | 256 |         rawdata = self.rawdata | 
| Ezio Melotti | f117443 | 2012-02-13 16:28:54 +0200 | [diff] [blame] | 257 |         if rawdata[i:i+2] not in ('<!', '</'): | 
| Ezio Melotti | 4b92cc3 | 2012-02-13 16:10:44 +0200 | [diff] [blame] | 258 |             self.error('unexpected call to parse_comment()') | 
 | 259 |         pos = rawdata.find('>', i+2) | 
 | 260 |         if pos == -1: | 
 | 261 |             return -1 | 
 | 262 |         if report: | 
 | 263 |             self.handle_comment(rawdata[i+2:pos]) | 
 | 264 |         return pos + 1 | 
 | 265 |  | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 266 |     # Internal -- parse processing instr, return end or -1 if not terminated | 
 | 267 |     def parse_pi(self, i): | 
 | 268 |         rawdata = self.rawdata | 
 | 269 |         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()' | 
 | 270 |         match = piclose.search(rawdata, i+2) # > | 
 | 271 |         if not match: | 
 | 272 |             return -1 | 
 | 273 |         j = match.start() | 
 | 274 |         self.handle_pi(rawdata[i+2: j]) | 
 | 275 |         j = match.end() | 
 | 276 |         return j | 
 | 277 |  | 
 | 278 |     # Internal -- handle starttag, return end or -1 if not terminated | 
 | 279 |     def parse_starttag(self, i): | 
 | 280 |         self.__starttag_text = None | 
 | 281 |         endpos = self.check_for_whole_start_tag(i) | 
 | 282 |         if endpos < 0: | 
 | 283 |             return endpos | 
 | 284 |         rawdata = self.rawdata | 
 | 285 |         self.__starttag_text = rawdata[i:endpos] | 
 | 286 |  | 
 | 287 |         # Now parse the data between i+1 and j into a tag and attrs | 
 | 288 |         attrs = [] | 
 | 289 |         match = tagfind.match(rawdata, i+1) | 
 | 290 |         assert match, 'unexpected call to parse_starttag()' | 
 | 291 |         k = match.end() | 
 | 292 |         self.lasttag = tag = rawdata[i+1:k].lower() | 
 | 293 |  | 
 | 294 |         while k < endpos: | 
 | 295 |             m = attrfind.match(rawdata, k) | 
 | 296 |             if not m: | 
 | 297 |                 break | 
 | 298 |             attrname, rest, attrvalue = m.group(1, 2, 3) | 
 | 299 |             if not rest: | 
 | 300 |                 attrvalue = None | 
 | 301 |             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \ | 
 | 302 |                  attrvalue[:1] == '"' == attrvalue[-1:]: | 
 | 303 |                 attrvalue = attrvalue[1:-1] | 
| Ezio Melotti | 0f1571c | 2011-11-14 18:04:05 +0200 | [diff] [blame] | 304 |             if attrvalue: | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 305 |                 attrvalue = self.unescape(attrvalue) | 
 | 306 |             attrs.append((attrname.lower(), attrvalue)) | 
 | 307 |             k = m.end() | 
 | 308 |  | 
 | 309 |         end = rawdata[k:endpos].strip() | 
 | 310 |         if end not in (">", "/>"): | 
 | 311 |             lineno, offset = self.getpos() | 
 | 312 |             if "\n" in self.__starttag_text: | 
 | 313 |                 lineno = lineno + self.__starttag_text.count("\n") | 
 | 314 |                 offset = len(self.__starttag_text) \ | 
 | 315 |                          - self.__starttag_text.rfind("\n") | 
 | 316 |             else: | 
 | 317 |                 offset = offset + len(self.__starttag_text) | 
 | 318 |             self.error("junk characters in start tag: %r" | 
 | 319 |                        % (rawdata[k:endpos][:20],)) | 
 | 320 |         if end.endswith('/>'): | 
 | 321 |             # XHTML-style empty tag: <span attr="value" /> | 
 | 322 |             self.handle_startendtag(tag, attrs) | 
 | 323 |         else: | 
 | 324 |             self.handle_starttag(tag, attrs) | 
 | 325 |             if tag in self.CDATA_CONTENT_ELEMENTS: | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 326 |                 self.set_cdata_mode(tag) | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 327 |         return endpos | 
 | 328 |  | 
 | 329 |     # Internal -- check to see if we have a complete starttag; return end | 
 | 330 |     # or -1 if incomplete. | 
 | 331 |     def check_for_whole_start_tag(self, i): | 
 | 332 |         rawdata = self.rawdata | 
 | 333 |         m = locatestarttagend.match(rawdata, i) | 
 | 334 |         if m: | 
 | 335 |             j = m.end() | 
 | 336 |             next = rawdata[j:j+1] | 
 | 337 |             if next == ">": | 
 | 338 |                 return j + 1 | 
 | 339 |             if next == "/": | 
 | 340 |                 if rawdata.startswith("/>", j): | 
 | 341 |                     return j + 2 | 
 | 342 |                 if rawdata.startswith("/", j): | 
 | 343 |                     # buffer boundary | 
 | 344 |                     return -1 | 
 | 345 |                 # else bogus input | 
 | 346 |                 self.updatepos(i, j + 1) | 
 | 347 |                 self.error("malformed empty start tag") | 
 | 348 |             if next == "": | 
 | 349 |                 # end of input | 
 | 350 |                 return -1 | 
 | 351 |             if next in ("abcdefghijklmnopqrstuvwxyz=/" | 
 | 352 |                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"): | 
 | 353 |                 # end of input in or before attribute value, or we have the | 
 | 354 |                 # '/' from a '/>' ending | 
 | 355 |                 return -1 | 
 | 356 |             self.updatepos(i, j) | 
 | 357 |             self.error("malformed start tag") | 
 | 358 |         raise AssertionError("we should not get here!") | 
 | 359 |  | 
 | 360 |     # Internal -- parse endtag, return end or -1 if incomplete | 
 | 361 |     def parse_endtag(self, i): | 
 | 362 |         rawdata = self.rawdata | 
 | 363 |         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag" | 
 | 364 |         match = endendtag.search(rawdata, i+1) # > | 
 | 365 |         if not match: | 
 | 366 |             return -1 | 
| Ezio Melotti | f117443 | 2012-02-13 16:28:54 +0200 | [diff] [blame] | 367 |         gtpos = match.end() | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 368 |         match = endtagfind.match(rawdata, i) # </ + tag + > | 
 | 369 |         if not match: | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 370 |             if self.cdata_elem is not None: | 
| Ezio Melotti | f117443 | 2012-02-13 16:28:54 +0200 | [diff] [blame] | 371 |                 self.handle_data(rawdata[i:gtpos]) | 
 | 372 |                 return gtpos | 
 | 373 |             # find the name: w3.org/TR/html5/tokenization.html#tag-name-state | 
 | 374 |             namematch = tagfind_tolerant.match(rawdata, i+2) | 
 | 375 |             if not namematch: | 
 | 376 |                 # w3.org/TR/html5/tokenization.html#end-tag-open-state | 
 | 377 |                 if rawdata[i:i+3] == '</>': | 
 | 378 |                     return i+3 | 
 | 379 |                 else: | 
 | 380 |                     return self.parse_bogus_comment(i) | 
 | 381 |             tagname = namematch.group().lower() | 
 | 382 |             # consume and ignore other stuff between the name and the > | 
 | 383 |             # Note: this is not 100% correct, since we might have things like | 
 | 384 |             # </tag attr=">">, but looking for > after tha name should cover | 
 | 385 |             # most of the cases and is much simpler | 
 | 386 |             gtpos = rawdata.find('>', namematch.end()) | 
 | 387 |             self.handle_endtag(tagname) | 
 | 388 |             return gtpos+1 | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 389 |  | 
 | 390 |         elem = match.group(1).lower() # script or style | 
 | 391 |         if self.cdata_elem is not None: | 
 | 392 |             if elem != self.cdata_elem: | 
| Ezio Melotti | f117443 | 2012-02-13 16:28:54 +0200 | [diff] [blame] | 393 |                 self.handle_data(rawdata[i:gtpos]) | 
 | 394 |                 return gtpos | 
| Ezio Melotti | 7e82b27 | 2011-11-01 14:09:56 +0200 | [diff] [blame] | 395 |  | 
 | 396 |         self.handle_endtag(elem) | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 397 |         self.clear_cdata_mode() | 
| Ezio Melotti | f117443 | 2012-02-13 16:28:54 +0200 | [diff] [blame] | 398 |         return gtpos | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 399 |  | 
 | 400 |     # Overridable -- finish processing of start+end tag: <tag.../> | 
 | 401 |     def handle_startendtag(self, tag, attrs): | 
 | 402 |         self.handle_starttag(tag, attrs) | 
 | 403 |         self.handle_endtag(tag) | 
 | 404 |  | 
 | 405 |     # Overridable -- handle start tag | 
 | 406 |     def handle_starttag(self, tag, attrs): | 
 | 407 |         pass | 
 | 408 |  | 
 | 409 |     # Overridable -- handle end tag | 
 | 410 |     def handle_endtag(self, tag): | 
 | 411 |         pass | 
 | 412 |  | 
 | 413 |     # Overridable -- handle character reference | 
 | 414 |     def handle_charref(self, name): | 
 | 415 |         pass | 
 | 416 |  | 
 | 417 |     # Overridable -- handle entity reference | 
 | 418 |     def handle_entityref(self, name): | 
 | 419 |         pass | 
 | 420 |  | 
 | 421 |     # Overridable -- handle data | 
 | 422 |     def handle_data(self, data): | 
 | 423 |         pass | 
 | 424 |  | 
 | 425 |     # Overridable -- handle comment | 
 | 426 |     def handle_comment(self, data): | 
 | 427 |         pass | 
 | 428 |  | 
 | 429 |     # Overridable -- handle declaration | 
 | 430 |     def handle_decl(self, decl): | 
 | 431 |         pass | 
 | 432 |  | 
 | 433 |     # Overridable -- handle processing instruction | 
 | 434 |     def handle_pi(self, data): | 
 | 435 |         pass | 
 | 436 |  | 
 | 437 |     def unknown_decl(self, data): | 
| Ezio Melotti | 369cbd7 | 2012-02-13 20:36:55 +0200 | [diff] [blame] | 438 |         pass | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 439 |  | 
 | 440 |     # Internal -- helper to remove special character quoting | 
 | 441 |     entitydefs = None | 
 | 442 |     def unescape(self, s): | 
 | 443 |         if '&' not in s: | 
 | 444 |             return s | 
 | 445 |         def replaceEntities(s): | 
 | 446 |             s = s.groups()[0] | 
| Senthil Kumaran | 3f60f09 | 2010-12-28 16:05:07 +0000 | [diff] [blame] | 447 |             try: | 
 | 448 |                 if s[0] == "#": | 
 | 449 |                     s = s[1:] | 
 | 450 |                     if s[0] in ['x','X']: | 
 | 451 |                         c = int(s[1:], 16) | 
 | 452 |                     else: | 
 | 453 |                         c = int(s) | 
 | 454 |                     return unichr(c) | 
 | 455 |             except ValueError: | 
 | 456 |                 return '&#'+s+';' | 
| Fred Drake | d995e11 | 2008-05-20 06:08:38 +0000 | [diff] [blame] | 457 |             else: | 
 | 458 |                 # Cannot use name2codepoint directly, because HTMLParser supports apos, | 
 | 459 |                 # which is not part of HTML 4 | 
 | 460 |                 import htmlentitydefs | 
 | 461 |                 if HTMLParser.entitydefs is None: | 
 | 462 |                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"} | 
 | 463 |                     for k, v in htmlentitydefs.name2codepoint.iteritems(): | 
 | 464 |                         entitydefs[k] = unichr(v) | 
 | 465 |                 try: | 
 | 466 |                     return self.entitydefs[s] | 
 | 467 |                 except KeyError: | 
 | 468 |                     return '&'+s+';' | 
 | 469 |  | 
 | 470 |         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s) |