Sjoerd's latest.
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index 6d7f1d1..355714f 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py
@@ -5,34 +5,50 @@
 import string
 
 
+version = '0.1'
+
 # Regular expressions used for parsing
 
 _S = '[ \t\r\n]+'
 _opS = '[ \t\r\n]*'
 _Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'
-interesting = re.compile('[&<]')
-incomplete = re.compile('&(' + _Name + '|#[0-9]*|#x[0-9a-fA-F]*)?|'
-                           '<([a-zA-Z_:][^<>]*|'
-                              '/([a-zA-Z_:][^<>]*)?|'
-                              '![^<>]*|'
-                              r'\?[^<>]*)?')
+illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
+interesting = re.compile('[]&<]')
 
-ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+);?')
+amp = re.compile('&')
+ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
 entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
 charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
-space = re.compile(_S)
+space = re.compile(_S + '$')
 newline = re.compile('\n')
 
 starttagopen = re.compile('<' + _Name)
 endtagopen = re.compile('</')
 starttagend = re.compile(_opS + '(?P<slash>/?)>')
-endbracket = re.compile('>')
+endbracket = re.compile(_opS + '>')
 tagfind = re.compile(_Name)
 cdataopen = re.compile(r'<!\[CDATA\[')
 cdataclose = re.compile(r'\]\]>')
-doctype = re.compile('<!DOCTYPE' + _S + '(?P<name>' + _Name + ')' + _S)
-special = re.compile('<!(?P<special>[^<>]*)>')
-procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _S)
+# this matches one of the following:
+# SYSTEM SystemLiteral
+# PUBLIC PubidLiteral SystemLiteral
+_SystemLiteral = '(?P<%s>\'[^\']*\'|"[^"]*")'
+_PublicLiteral = '(?P<%s>"[-\'()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
+                        "'[-()+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
+_ExternalId = '(?:SYSTEM|' \
+                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
+              ')'+_S+_SystemLiteral%'syslit'
+doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
+                     '(?:'+_S+_ExternalId+')?'+_opS)
+xmldecl = re.compile('<\?xml'+_S+
+                     'version'+_opS+'='+_opS+'(?P<version>\'[^\']*\'|"[^"]*")'+
+                     '(?:'+_S+'encoding'+_opS+'='+_opS+
+                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
+                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
+                     '(?:'+_S+'standalone'+_opS+'='+_opS+
+                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
+                     _opS+'\?>')
+procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
 procclose = re.compile(_opS + r'\?>')
 commentopen = re.compile('<!--')
 commentclose = re.compile('-->')
@@ -41,6 +57,7 @@
     _S + '(?P<name>' + _Name + ')'
     '(' + _opS + '=' + _opS +
     '(?P<value>\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9.:+*%?!()_#=~]+))')
+attrtrans = string.maketrans(' \r\n\t', '    ')
 
 
 # XML parser base class -- find tags and call handler functions.
@@ -92,30 +109,43 @@
         self.goahead(1)
 
     # Interface -- translate references
-    def translate_references(self, data):
-        newdata = []
+    def translate_references(self, data, all = 1):
         i = 0
         while 1:
-            res = ref.search(data, i)
+            res = amp.search(data, i)
             if res is None:
-                newdata.append(data[i:])
-                return string.join(newdata, '')
-            if data[res.end(0) - 1] != ';':
+                return data
+            res = ref.match(data, res.start(0))
+            if res is None:
+                self.syntax_error("bogus `&'")
+                i = string.find(data, '&', i) + 1  # resume just past this `&'
+                continue
+            i = res.end(0)
+            if data[i - 1] != ';':
                 self.syntax_error("`;' missing after entity/char reference")
-            newdata.append(data[i:res.start(0)])
+                i = i-1
             str = res.group(1)
+            pre = data[:res.start(0)]
+            post = data[i:]
             if str[0] == '#':
                 if str[1] == 'x':
-                    newdata.append(chr(string.atoi(str[2:], 16)))
+                    str = chr(string.atoi(str[2:], 16))
                 else:
-                    newdata.append(chr(string.atoi(str[1:])))
-            else:
-                try:
-                    newdata.append(self.entitydefs[str])
-                except KeyError:
+                    str = chr(string.atoi(str[1:]))
+                data = pre + str + post
+                i = res.start(0)+len(str)
+            elif all:
+                if self.entitydefs.has_key(str):
+                    data = pre + self.entitydefs[str] + post
+                    i = res.start(0)    # rescan substituted text
+                else:
+                    self.syntax_error('reference to unknown entity')
                     # can't do it, so keep the entity ref in
-                    newdata.append('&' + str + ';')
-            i = res.end(0)
+                    data = pre + '&' + str + ';' + post
+                    i = res.start(0) + len(str) + 2
+            else:
+                # just translating character references
+                pass                    # i is already positioned correctly
 
     # Internal -- handle data as far as reasonable.  May leave state
     # and data to be processed by a subsequent call.  If 'end' is
@@ -139,8 +169,14 @@
             else:
                     j = n
             if i < j:
+                if self.__at_start:
+                    self.syntax_error('illegal data at start of file')
                 self.__at_start = 0
                 data = rawdata[i:j]
+                if not self.stack and not space.match(data):
+                    self.syntax_error('data not in content')
+                if illegal.search(data):
+                    self.syntax_error('illegal character in content')
                 self.handle_data(data)
                 self.lineno = self.lineno + string.count(data, '\n')
             i = j
@@ -184,6 +220,20 @@
                     self.lineno = self.lineno + string.count(rawdata[i:i], '\n')
                     i = k
                     continue
+                res = xmldecl.match(rawdata, i)
+                if res:
+                    if not self.__at_start:
+                        self.syntax_error("<?xml?> declaration not at start of document")
+                    version, encoding, standalone = res.group('version',
+                                                              'encoding',
+                                                              'standalone')
+                    if version[1:-1] != '1.0':
+                        raise RuntimeError, 'only XML version 1.0 supported'
+                    if encoding: encoding = encoding[1:-1]
+                    if standalone: standalone = standalone[1:-1]
+                    self.handle_xml(encoding, standalone)
+                    i = res.end(0)
+                    continue
                 res = procopen.match(rawdata, i)
                 if res:
                     k = self.parse_proc(i)
@@ -209,18 +259,6 @@
                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                     i = k
                     continue
-                res = special.match(rawdata, i)
-                if res:
-                    if self.literal:
-                        data = rawdata[i]
-                        self.handle_data(data)
-                        self.lineno = self.lineno + string.count(data, '\n')
-                        i = i+1
-                        continue
-                    self.handle_special(res.group('special'))
-                    self.lineno = self.lineno + string.count(res.group(0), '\n')
-                    i = res.end(0)
-                    continue
             elif rawdata[i] == '&':
                 res = charref.match(rawdata, i)
                 if res is not None:
@@ -228,6 +266,8 @@
                     if rawdata[i-1] != ';':
                         self.syntax_error("`;' missing in charref")
                         i = i-1
+                    if not self.stack:
+                        self.syntax_error('data not in content')
                     self.handle_charref(res.group('char')[:-1])
                     self.lineno = self.lineno + string.count(res.group(0), '\n')
                     continue
@@ -237,36 +277,45 @@
                     if rawdata[i-1] != ';':
                         self.syntax_error("`;' missing in entityref")
                         i = i-1
-                    self.handle_entityref(res.group('name'))
+                    name = res.group('name')
+                    if self.entitydefs.has_key(name):
+                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
+                        n = len(rawdata)
+                        i = res.start(0)
+                    else:
+                        self.syntax_error('reference to unknown entity')
+                        self.unknown_entityref(name)
                     self.lineno = self.lineno + string.count(res.group(0), '\n')
                     continue
+            elif rawdata[i] == ']':
+                if n-i < 3:
+                    break
+                if cdataclose.match(rawdata, i):
+                    self.syntax_error("bogus `]]>'")
+                self.handle_data(rawdata[i])
+                i = i+1
+                continue
             else:
                 raise RuntimeError, 'neither < nor & ??'
             # We get here only if incomplete matches but
             # nothing else
-            res = incomplete.match(rawdata, i)
-            if not res:
-                data = rawdata[i]
-                self.handle_data(data)
-                self.lineno = self.lineno + string.count(data, '\n')
-                i = i+1
-                continue
-            j = res.end(0)
-            if j == n:
-                break # Really incomplete
-            self.syntax_error("bogus `<' or `&'")
-            data = res.group(0)
-            self.handle_data(data)
-            self.lineno = self.lineno + string.count(data, '\n')
-            i = j
+            break
         # end while
+        if i > 0:
+            self.__at_start = 0
         if end and i < n:
-            data = rawdata[i:n]
+            data = rawdata[i]
+            self.syntax_error("bogus `%s'" % data)
+            if illegal.search(data):
+                self.syntax_error('illegal character in content')
             self.handle_data(data)
             self.lineno = self.lineno + string.count(data, '\n')
-            i = n
+            self.rawdata = rawdata[i+1:]
+            return self.goahead(end)
         self.rawdata = rawdata[i:]
         if end:
+            if not self.__seen_starttag:
+                self.syntax_error('no elements in file')
             if self.stack:
                 self.syntax_error('missing end tags')
                 while self.stack:
@@ -280,9 +329,12 @@
         res = commentclose.search(rawdata, i+4)
         if not res:
             return -1
-        # doubledash search will succeed because it's a subset of commentclose
-        if doubledash.search(rawdata, i+4).start(0) < res.start(0):
+        if doubledash.search(rawdata, i+4, res.start(0)):
             self.syntax_error("`--' inside comment")
+        if res.start(0) > i+4 and rawdata[res.start(0)-1] == '-': # not for empty <!---->
+            self.syntax_error('comment cannot end in three dashes')
+        if illegal.search(rawdata, i+4, res.start(0)):
+            self.syntax_error('illegal character in comment')
         self.handle_comment(rawdata[i+4: res.start(0)])
         return res.end(0)
 
@@ -291,28 +343,59 @@
         rawdata = self.rawdata
         n = len(rawdata)
         name = res.group('name')
+        pubid, syslit = res.group('pubid', 'syslit')
+        if pubid is not None:
+            pubid = pubid[1:-1]         # remove quotes
+            pubid = string.join(string.split(pubid)) # normalize
+        if syslit is not None: syslit = syslit[1:-1] # remove quotes
         j = k = res.end(0)
-        level = 0
-        while k < n:
-            c = rawdata[k]
-            if c == '<':
-                level = level + 1
-            elif c == '>':
-                if level == 0:
-                    self.handle_doctype(name, rawdata[j:k])
-                    return k+1
-                level = level - 1
+        if k >= n:
+            return -1
+        if rawdata[k] == '[':
+            level = 0
             k = k+1
-        return -1
+            dq = sq = 0
+            while k < n:
+                c = rawdata[k]
+                if not sq and c == '"':
+                    dq = not dq
+                elif not dq and c == "'":
+                    sq = not sq
+                elif sq or dq:
+                    pass
+                elif level <= 0 and c == ']':
+                    res = endbracket.match(rawdata, k+1)
+                    if not res:
+                        return -1
+                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])
+                    return res.end(0)
+                elif c == '<':
+                    level = level + 1
+                elif c == '>':
+                    level = level - 1
+                    if level < 0:
+                        self.syntax_error("bogus `>' in DOCTYPE")
+                k = k+1
+        res = endbracket.search(rawdata, k)
+        if not res:
+            return -1
+        if res.start(0) != k:
+            self.syntax_error('garbage in DOCTYPE')
+        self.handle_doctype(name, pubid, syslit, None)
+        return res.end(0)
 
     # Internal -- handle CDATA tag, return length or -1 if not terminated
     def parse_cdata(self, i):
         rawdata = self.rawdata
         if rawdata[i:i+9] <> '<![CDATA[':
-            raise RuntimeError, 'unexpected call to handle_cdata'
+            raise RuntimeError, 'unexpected call to parse_cdata'
         res = cdataclose.search(rawdata, i+9)
         if not res:
             return -1
+        if illegal.search(rawdata, i+9, res.start(0)):
+            self.syntax_error('illegal character in CDATA')
+        if not self.stack:
+            self.syntax_error('CDATA not in content')
         self.handle_cdata(rawdata[i+9:res.start(0)])
         return res.end(0)
 
@@ -324,24 +407,15 @@
         if not end:
             return -1
         j = end.start(0)
+        if illegal.search(rawdata, i+2, j):
+            self.syntax_error('illegal character in processing instruction')
         res = tagfind.match(rawdata, i+2)
         if not res:
             raise RuntimeError, 'unexpected call to parse_proc'
         k = res.end(0)
         name = res.group(0)
-        if name == 'xml':
-            if self.__at_start:
-                attrdict, k = self.parse_attributes('xml', k, j,
-                                                    self.__xml_attributes)
-                if k != j:
-                    self.syntax_error('garbage at end of <?xml?>')
-                if attrdict['version'] != '1.0':
-                    self.syntax_error('only XML version 1.0 supported')
-                self.handle_xml(attrdict.get('encoding', None),
-                                attrdict['standalone'])
-                return end.end(0)
-            else:
-                self.syntax_error("<?xml?> tag not at start of document")
+        if string.lower(name) == 'xml': # only the bare target `xml' (any case) is illegal
+            self.syntax_error('illegal processing instruction target name')
         self.handle_proc(name, rawdata[k:j])
         return end.end(0)
 
@@ -375,6 +449,7 @@
                                   (attrname, tag))
             if attrdict.has_key(attrname):
                 self.syntax_error('attribute specified twice')
+            attrvalue = string.translate(attrvalue, attrtrans)
             attrdict[attrname] = self.translate_references(attrvalue)
             k = res.end(0)
         if attributes is not None:
@@ -400,6 +475,8 @@
         if not self.__seen_starttag and self.__seen_doctype:
             if tag != self.__seen_doctype:
                 self.syntax_error('starttag does not match DOCTYPE')
+        if self.__seen_starttag and not self.stack:
+            self.syntax_error('multiple elements on top level')
         if hasattr(self, tag + '_attributes'):
             attributes = getattr(self, tag + '_attributes')
         else:
@@ -428,10 +505,7 @@
             tag = res.group(0)
             k = res.end(0)
         if k != end.start(0):
-            # check that there is only white space at end of tag
-            res = space.match(rawdata, k)
-            if res is None or res.end(0) != end.start(0):
-                self.syntax_error('garbage in end tag')
+            self.syntax_error('garbage in end tag')
         self.finish_endtag(tag)
         return end.end(0)
 
@@ -439,17 +513,18 @@
     # Return -1 for unknown tag, 1 for balanced tag
     def finish_starttag(self, tag, attrs):
         self.stack.append(tag)
-        try:
-            method = getattr(self, 'start_' + tag)
-        except AttributeError:
-            self.unknown_starttag(tag, attrs)
-            return -1
-        else:
+        methodname = 'start_' + tag
+        if hasattr(self, methodname):
+            method = getattr(self, methodname)
             self.handle_starttag(tag, method, attrs)
             return 1
+        else:
+            self.unknown_starttag(tag, attrs)
+            return -1
 
     # Internal -- finish processing of end tag
     def finish_endtag(self, tag):
+        methodname = 'end_' + tag
         if not tag:
             self.syntax_error('name-less end tag')
             found = len(self.stack) - 1
@@ -459,9 +534,10 @@
         else:
             if tag not in self.stack:
                 self.syntax_error('unopened end tag')
-                try:
-                    method = getattr(self, 'end_' + tag)
-                except AttributeError:
+                if hasattr(self, methodname):
+                    method = getattr(self, methodname)
+                    self.handle_endtag(tag, method)
+                else:
                     self.unknown_endtag(tag)
                 return
             found = len(self.stack)
@@ -472,11 +548,8 @@
             if found < len(self.stack) - 1:
                 self.syntax_error('missing close tag for %s' % self.stack[-1])
             tag = self.stack[-1]
-            try:
-                method = getattr(self, 'end_' + tag)
-            except AttributeError:
-                method = None
-            if method:
+            if hasattr(self, 'end_' + tag): # tag was rebound to the stack top above
+                method = getattr(self, 'end_' + tag)
                 self.handle_endtag(tag, method)
             else:
                 self.unknown_endtag(tag)
@@ -487,7 +560,7 @@
         pass
 
     # Overridable -- handle DOCTYPE
-    def handle_doctype(self, tag, data):
+    def handle_doctype(self, tag, pubid, syslit, data):
         pass
 
     # Overridable -- handle start tag
@@ -514,7 +587,12 @@
         self.handle_data(chr(n))
 
     # Definition of entities -- derived classes may override
-    entitydefs = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': "'"}
+    entitydefs = {'lt': '&#60;',        # must use charref
+                  'gt': '&#62;',
+                  'amp': '&#38;',       # must use charref
+                  'quot': '&#34;',
+                  'apos': '&#39;',
+                  }
 
     # Example -- handle entity reference, no need to override
     def handle_entityref(self, name):
@@ -541,10 +619,6 @@
     def handle_proc(self, name, data):
         pass
 
-    # Example -- handle special instructions, could be overridden
-    def handle_special(self, data):
-        pass
-
     # Example -- handle relatively harmless syntax errors, could be overridden
     def syntax_error(self, message):
         raise RuntimeError, 'Syntax error at line %d: %s' % (self.lineno, message)
@@ -566,10 +640,14 @@
         self.flush()
         print 'xml: encoding =',encoding,'standalone =',standalone
 
-    def handle_doctype(self, tag, data):
+    def handle_doctype(self, tag, pubid, syslit, data):
         self.flush()
         print 'DOCTYPE:',tag, `data`
 
+    def handle_entity(self, name, strval, pubid, syslit, ndata):
+        self.flush()
+        print 'ENTITY:',name,`strval`,`pubid`,`syslit`,`ndata`
+
     def handle_data(self, data):
         self.testdata = self.testdata + data
         if len(`self.testdata`) >= 70:
@@ -589,10 +667,6 @@
         self.flush()
         print 'processing:',name,`data`
 
-    def handle_special(self, data):
-        self.flush()
-        print 'special:',`data`
-
     def handle_comment(self, data):
         self.flush()
         r = `data`
@@ -660,9 +734,13 @@
         f.close()
 
     x = klass()
-    for c in data:
-        x.feed(c)
-    x.close()
+    try:
+        for c in data:
+            x.feed(c)
+        x.close()
+    except RuntimeError, msg:
+        print msg
+        sys.exit(1)
 
 
 if __name__ == '__main__':