In CDATA mode, make sure entity-reference syntax is not interpreted; entity references are not allowed in that mode. Do a better job of scanning <!DOCTYPE ...> declarations, based on the code in HTMLParser.py.

commit: fb38c76e0f15e15d08e4635a24719cc120809191 [log] [tgz]
author: Fred Drake <fdrake@acm.org> Mon Jul 16 18:30:35 2001 +0000
committer: Fred Drake <fdrake@acm.org> Mon Jul 16 18:30:35 2001 +0000
tree: 84f02d0e0bf37352e792425f82d6aed4b2c614ca
parent: e16c7aee4bc2a8851b9a9bae60a00c2544722f67 [diff]
diff --git a/Lib/sgmllib.py b/Lib/sgmllib.py
index 5ff9f70..3422980 100644
--- a/Lib/sgmllib.py
+++ b/Lib/sgmllib.py

@@ -5,7 +5,8 @@
 # XXX There should be a way to distinguish between PCDATA (parsed
 # character data -- the normal case), RCDATA (replaceable character
 # data -- only char and entity references and end tags are special)
-# and CDATA (character data -- only end tags are special).
+# and CDATA (character data -- only end tags are special).  RCDATA is
+# not supported at all.
 
 
 import re
@@ -34,6 +35,9 @@
 special = re.compile('<![^<>]*>')
 commentopen = re.compile('<!--')
 commentclose = re.compile(r'--\s*>')
+declopen = re.compile('<!')
+declname = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*')
+declstringlit = re.compile(r'(\'[^\']*\'|"[^"]*")\s*')
 tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
 attrfind = re.compile(
     r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
@@ -160,6 +164,10 @@
                     i = k
                     continue
             elif rawdata[i] == '&':
+                if self.literal:
+                    self.handle_data(rawdata[i])
+                    i = i+1
+                    continue
                 match = charref.match(rawdata, i)
                 if match:
                     name = match.group(1)
@@ -210,11 +218,20 @@
 
     # Internal -- parse declaration.
     def parse_declaration(self, i):
+        # This is some sort of declaration; in "HTML as
+        # deployed," this should only be the document type
+        # declaration ("<!DOCTYPE html...>").
         rawdata = self.rawdata
         j = i + 2
+        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
+        if rawdata[j:j+1] in ("-", ""):
+            # Start of comment followed by buffer boundary,
+            # or just a buffer boundary.
+            return -1
+        # in practice, this should look like: ((name|stringlit) S*)+ '>'
         n = len(rawdata)
         while j < n:
-            c = rawdata[j:j+1]
+            c = rawdata[j]
             if c == ">":
                 # end of declaration syntax
                 self.handle_decl(rawdata[i+2:j])
@@ -222,15 +239,16 @@
             if c in "\"'":
                 m = declstringlit.match(rawdata, j)
                 if not m:
-                    # incomplete or an error?
-                    return -1
+                    return -1 # incomplete
+                j = m.end()
+            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
+                m = declname.match(rawdata, j)
+                if not m:
+                    return -1 # incomplete
                 j = m.end()
             else:
-                m = decldata.match(rawdata, j)
-                if not m:
-                    # incomplete or an error?
-                    return -1
-                j = m.end()
+                raise SGMLParseError(
+                    "unexpected char in declaration: %s" % `rawdata[j]`)
         # end of buffer between tokens
         return -1
commit	fb38c76e0f15e15d08e4635a24719cc120809191	[log] [tgz]
author	Fred Drake <fdrake@acm.org>	Mon Jul 16 18:30:35 2001 +0000
committer	Fred Drake <fdrake@acm.org>	Mon Jul 16 18:30:35 2001 +0000
tree	84f02d0e0bf37352e792425f82d6aed4b2c614ca
parent	e16c7aee4bc2a8851b9a9bae60a00c2544722f67 [diff]