Sjoerd Mullender writes: """ Added some optional arguments to the XMLParser __init__ method to specify that selected non-standard constructs are to be accepted. Also removed the documentation for handle_entityref since it isn't used. """ The version is incremented to 0.3.

commit: b35d6846d9ae6b1d23234b15cb5d8240eeb6c7d9 [log] [tgz]
author: Guido van Rossum <guido@python.org> Thu Aug 26 15:52:33 1999 +0000
committer: Guido van Rossum <guido@python.org> Thu Aug 26 15:52:33 1999 +0000
tree: 8cc8bb8d026e9849fb4d0631a9e1a53180496287
parent: 3601e88cb3a41f805216e9b9feda591f678f4014 [diff] [blame]
diff --git a/Lib/xmllib.py b/Lib/xmllib.py
index c74f71e..dcfe872 100644
--- a/Lib/xmllib.py
+++ b/Lib/xmllib.py

@@ -5,7 +5,7 @@
 import string
 
 
-version = '0.2'
+version = '0.3'
 
 # Regular expressions used for parsing
 
@@ -78,17 +78,29 @@
 # special names to handle tags: start_foo and end_foo to handle <foo>
 # and </foo>, respectively.  The data between tags is passed to the
 # parser by calling self.handle_data() with some data as argument (the
-# data may be split up in arbutrary chunks).  Entity references are
-# passed by calling self.handle_entityref() with the entity reference
-# as argument.
+# data may be split up in arbutrary chunks).
 
 class XMLParser:
     attributes = {}                     # default, to be overridden
     elements = {}                       # default, to be overridden
 
+    # parsing options, settable using keyword args in __init__
+    __accept_unquoted_attributes = 0
+    __accept_missing_endtag_name = 0
+    __map_case = 0
+    __accept_utf8 = 0
+
     # Interface -- initialize and reset this instance
-    def __init__(self):
+    def __init__(self, **kw):
         self.__fixed = 0
+        if kw.has_key('accept_unquoted_attributes'):
+            self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
+        if kw.has_key('accept_missing_endtag_name'):
+            self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
+        if kw.has_key('map_case'):
+            self.__map_case = kw['map_case']
+        if kw.has_key('accept_utf8'):
+            self.__accept_utf8 = kw['accept_utf8']
         self.reset()
 
     def __fixelements(self):
@@ -223,7 +235,7 @@
                 self.__at_start = 0
                 if not self.stack and space.match(data) is None:
                     self.syntax_error('data not in content')
-                if illegal.search(data):
+                if not self.__accept_utf8 and illegal.search(data):
                     self.syntax_error('illegal character in content')
                 self.handle_data(data)
                 self.lineno = self.lineno + string.count(data, '\n')
@@ -303,6 +315,8 @@
                     k = self.parse_doctype(res)
                     if k < 0: break
                     self.__seen_doctype = res.group('name')
+                    if self.__map_case:
+                        self.__seen_doctype = string.lower(self.__seen_doctype)
                     self.lineno = self.lineno + string.count(rawdata[i:k], '\n')
                     i = k
                     continue
@@ -330,12 +344,13 @@
                         self.syntax_error("`;' missing in entityref")
                         i = i-1
                     name = res.group('name')
+                    if self.__map_case:
+                        name = string.lower(name)
                     if self.entitydefs.has_key(name):
                         self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]
                         n = len(rawdata)
                         i = res.start(0)
                     else:
-                        self.syntax_error("reference to unknown entity `&%s;'" % name)
                         self.unknown_entityref(name)
                     self.lineno = self.lineno + string.count(res.group(0), '\n')
                     continue
@@ -363,7 +378,7 @@
         if end and i < n:
             data = rawdata[i]
             self.syntax_error("bogus `%s'" % data)
-            if illegal.search(data):
+            if not self.__accept_utf8 and illegal.search(data):
                 self.syntax_error('illegal character in content')
             self.handle_data(data)
             self.lineno = self.lineno + string.count(data, '\n')
@@ -390,7 +405,8 @@
             self.syntax_error("`--' inside comment")
         if rawdata[res.start(0)-1] == '-':
             self.syntax_error('comment cannot end in three dashes')
-        if illegal.search(rawdata, i+4, res.start(0)):
+        if not self.__accept_utf8 and \
+           illegal.search(rawdata, i+4, res.start(0)):
             self.syntax_error('illegal character in comment')
         self.handle_comment(rawdata[i+4: res.start(0)])
         return res.end(0)
@@ -400,6 +416,8 @@
         rawdata = self.rawdata
         n = len(rawdata)
         name = res.group('name')
+        if self.__map_case:
+            name = string.lower(name)
         pubid, syslit = res.group('pubid', 'syslit')
         if pubid is not None:
             pubid = pubid[1:-1]         # remove quotes
@@ -449,7 +467,8 @@
         res = cdataclose.search(rawdata, i+9)
         if res is None:
             return -1
-        if illegal.search(rawdata, i+9, res.start(0)):
+        if not self.__accept_utf8 and \
+           illegal.search(rawdata, i+9, res.start(0)):
             self.syntax_error('illegal character in CDATA')
         if not self.stack:
             self.syntax_error('CDATA not in content')
@@ -464,13 +483,15 @@
         if end is None:
             return -1
         j = end.start(0)
-        if illegal.search(rawdata, i+2, j):
+        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):
             self.syntax_error('illegal character in processing instruction')
         res = tagfind.match(rawdata, i+2)
         if res is None:
             raise RuntimeError, 'unexpected call to parse_proc'
         k = res.end(0)
         name = res.group(0)
+        if self.__map_case:
+            name = string.lower(name)
         if name == 'xml:namespace':
             self.syntax_error('old-fashioned namespace declaration')
             self.__use_namespaces = -1
@@ -510,6 +531,8 @@
             if res is None:
                 break
             attrname, attrvalue = res.group('name', 'value')
+            if self.__map_case:
+                attrname = string.lower(attrname)
             i = res.end(0)
             if attrvalue is None:
                 self.syntax_error("no value specified for attribute `%s'" % attrname)
@@ -517,7 +540,7 @@
             elif attrvalue[:1] == "'" == attrvalue[-1:] or \
                  attrvalue[:1] == '"' == attrvalue[-1:]:
                 attrvalue = attrvalue[1:-1]
-            else:
+            elif not self.__accept_unquoted_attributes:
                 self.syntax_error("attribute `%s' value not quoted" % attrname)
             res = xmlns.match(attrname)
             if res is not None:
@@ -547,6 +570,8 @@
             self.syntax_error('garbage in starttag')
             return end.end(0)
         nstag = tagname = tag.group('tagname')
+        if self.__map_case:
+            nstag = tagname = string.lower(nstag)
         if not self.__seen_starttag and self.__seen_doctype and \
            tagname != self.__seen_doctype:
             self.syntax_error('starttag does not match DOCTYPE')
@@ -581,6 +606,8 @@
                 res = qname.match(key)
                 if res is not None:
                     aprefix, key = res.group('prefix', 'local')
+                    if self.__map_case:
+                        key = string.lower(key)
                     if aprefix is None:
                         aprefix = ''
                     ans = None
@@ -622,11 +649,14 @@
             if self.literal:
                 self.handle_data(rawdata[i])
                 return i+1
-            self.syntax_error('no name specified in end tag')
-            tag = ''
+            if not self.__accept_missing_endtag_name:
+                self.syntax_error('no name specified in end tag')
+                tag = self.stack[-1][0]
             k = i+2
         else:
             tag = res.group(0)
+            if self.__map_case:
+                tag = string.lower(tag)
             if self.literal:
                 if not self.stack or tag != self.stack[-1][0]:
                     self.handle_data(rawdata[i])
@@ -718,15 +748,6 @@
                   'apos': '&#39;',
                   }
 
-    # Example -- handle entity reference, no need to override
-    def handle_entityref(self, name):
-        table = self.entitydefs
-        if table.has_key(name):
-            self.handle_data(table[name])
-        else:
-            self.unknown_entityref(name)
-            return
-
     # Example -- handle data, should be overridden
     def handle_data(self, data):
         pass
@@ -751,14 +772,15 @@
     def unknown_starttag(self, tag, attrs): pass
     def unknown_endtag(self, tag): pass
     def unknown_charref(self, ref): pass
-    def unknown_entityref(self, ref): pass
+    def unknown_entityref(self, name):
+        self.syntax_error("reference to unknown entity `&%s;'" % name)
 
 
 class TestXMLParser(XMLParser):
 
-    def __init__(self):
+    def __init__(self, **kw):
         self.testdata = ""
-        XMLParser.__init__(self)
+        apply(XMLParser.__init__, (self,), kw)
 
     def handle_xml(self, encoding, standalone):
         self.flush()
@@ -768,10 +790,6 @@
         self.flush()
         print 'DOCTYPE:',tag, `data`
 
-    def handle_entity(self, name, strval, pubid, syslit, ndata):
-        self.flush()
-        print 'ENTITY:',`data`
-
     def handle_data(self, data):
         self.testdata = self.testdata + data
         if len(`self.testdata`) >= 70:
commit	b35d6846d9ae6b1d23234b15cb5d8240eeb6c7d9	[log] [tgz]
author	Guido van Rossum <guido@python.org>	Thu Aug 26 15:52:33 1999 +0000
committer	Guido van Rossum <guido@python.org>	Thu Aug 26 15:52:33 1999 +0000
tree	8cc8bb8d026e9849fb4d0631a9e1a53180496287
parent	3601e88cb3a41f805216e9b9feda591f678f4014 [diff] [blame]