blob: 1abcd9a0c4aa8bf03359d0f00ea0b84489d235b2 [file] [log] [blame]
"""\
A library of useful helper classes to the SAX classes, for the
convenience of application and driver writers.
"""
5
Martin v. Löwis58af43f2000-09-24 21:31:06 +00006import os, urlparse, urllib, types
Serhiy Storchakaf8980382013-02-10 14:26:08 +02007import io
Serhiy Storchaka8673ab92013-02-02 10:28:30 +02008import sys
Fred Drake45cd9de2000-06-29 19:34:54 +00009import handler
Lars Gustäbelfc643c32000-09-24 10:53:31 +000010import xmlreader
Fred Drakea12adfe2000-09-18 17:40:22 +000011
# String types accepted as a system identifier by prepare_input_source().
# types.UnicodeType is only added when the running interpreter provides it.
_StringTypes = [types.StringType]
if hasattr(types, "UnicodeType"):
    _StringTypes.append(types.UnicodeType)
16
def __dict_replace(s, d):
    """Return s with every key of d replaced by its associated value.

    The replacements are applied one after another, in the iteration
    order of d.items().
    """
    result = s
    for pair in d.items():
        # pair is (old, new); unpack it straight into str.replace().
        result = result.replace(*pair)
    return result
Martin v. Löwis58af43f2000-09-24 21:31:06 +000022
def escape(data, entities={}):
    """Escape &, <, and > in a string of data.

    Additional strings can be escaped by supplying a dictionary as the
    optional entities parameter.  Its keys and values must all be
    strings; every occurrence of a key is replaced with the matching
    value after the three standard characters have been handled.
    """
    # The ampersand has to be handled first: the other replacements
    # introduce ampersands of their own that must not be escaped again.
    for char, reference in (("&", "&amp;"), (">", "&gt;"), ("<", "&lt;")):
        data = data.replace(char, reference)
    if entities:
        data = __dict_replace(data, entities)
    return data
Martin v. Löwis74b51ac2002-10-26 14:50:45 +000038
def unescape(data, entities={}):
    """Unescape &amp;, &lt;, and &gt; in a string of data.

    Additional strings can be unescaped by supplying a dictionary as
    the optional entities parameter.  Its keys and values must all be
    strings; every occurrence of a key is replaced with the matching
    value.
    """
    text = data.replace("&lt;", "<").replace("&gt;", ">")
    if entities:
        text = __dict_replace(text, entities)
    # The ampersand has to be handled last, so that an escaped ampersand
    # followed by e.g. "lt;" is not unescaped a second time.
    return text.replace("&amp;", "&")
Fred Drake45cd9de2000-06-29 19:34:54 +000052
def quoteattr(data, entities={}):
    """Escape and quote an attribute value.

    &, <, and > are escaped, newline, carriage return and tab are
    replaced by character references, and the result is wrapped in
    quotes so that it can be used as an attribute value.  Double quotes
    are used unless the value contains a double quote and no single
    quote, in which case single quotes are used; when it contains both,
    the double quotes inside the value are escaped as &quot;.

    Additional strings can be escaped by supplying a dictionary as the
    optional entities parameter.  Its keys and values must all be
    strings; every occurrence of a key is replaced with the matching
    value.
    """
    # Work on a copy so that the caller's dictionary is left untouched.
    table = entities.copy()
    table.update({'\n': '&#10;', '\r': '&#13;', '\t':'&#9;'})
    data = escape(data, table)
    if '"' not in data:
        return '"%s"' % data
    if "'" not in data:
        return "'%s'" % data
    return '"%s"' % data.replace('"', "&quot;")
75
Fred Drakea12adfe2000-09-18 17:40:22 +000076
class _UnbufferedTextIOWrapper(io.TextIOWrapper):
    """TextIOWrapper that flushes itself after every write() call."""

    def write(self, s):
        super(_UnbufferedTextIOWrapper, self).write(s)
        self.flush()


def _gettextwriter(out, encoding):
    """Return a text writer that encodes to encoding and writes to out.

    out is any object with a write() method accepting encoded bytes;
    when it is None, sys.stdout is used.  Characters that cannot be
    represented in encoding are written as XML character references,
    and every write is flushed through to out immediately.
    """
    if out is None:
        out = sys.stdout

    # TextIOWrapper needs a buffer object to talk to.  A bare
    # BufferedIOBase is used as a thin adapter that forwards write() to
    # out; because the adapter never owns out, closing or destroying
    # the wrapper leaves the original stream open.  This also covers
    # objects that aren't in the IOBase hierarchy, but just have a
    # write method.
    buffer = io.BufferedIOBase()
    buffer.writable = lambda: True
    buffer.write = out.write
    if isinstance(out, io.RawIOBase):
        # Raw streams previously got an adapter that was not connected
        # to out at all, so nothing could be written through it.  The
        # no-op close() of that adapter is kept.
        buffer.close = lambda: None
    try:
        # TextIOWrapper uses these methods to determine
        # if BOM (for UTF-16, etc) should be added
        buffer.seekable = out.seekable
        buffer.tell = out.tell
    except AttributeError:
        pass
    # wrap a binary writer with TextIOWrapper
    return _UnbufferedTextIOWrapper(buffer, encoding=encoding,
                                    errors='xmlcharrefreplace',
                                    newline='\n')
Serhiy Storchakaf8980382013-02-10 14:26:08 +0200108
class XMLGenerator(handler.ContentHandler):
    """ContentHandler that writes the SAX events it receives as XML text.

    out is the object the document is written to (see _gettextwriter();
    None selects sys.stdout) and encoding is both the encoding of the
    output and the value announced in the XML declaration.
    """

    def __init__(self, out=None, encoding="iso-8859-1"):
        handler.ContentHandler.__init__(self)
        out = _gettextwriter(out, encoding)
        self._write = out.write
        self._flush = out.flush
        self._ns_contexts = [{}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # (prefix, uri) pairs announced by startPrefixMapping() that
        # have not been written as xmlns attributes yet.
        self._undeclared_ns_maps = []
        self._encoding = encoding

    def _qname(self, name):
        """Builds a qualified name from a (ns_url, localname) pair"""
        if not name[0]:
            # Return the unqualified name
            return name[1]
        # Per http://www.w3.org/XML/1998/namespace, The 'xml' prefix is
        # bound by definition to http://www.w3.org/XML/1998/namespace.  It
        # does not need to be declared and will not usually be found in
        # self._current_context.
        if 'http://www.w3.org/XML/1998/namespace' == name[0]:
            return 'xml:' + name[1]
        # The name is in a non-empty namespace
        prefix = self._current_context[name[0]]
        if prefix:
            # If it is not the default namespace, prepend the prefix
            return prefix + ":" + name[1]
        # Default namespace: the name is written without a prefix
        return name[1]

    # ContentHandler methods

    def startDocument(self):
        self._write(u'<?xml version="1.0" encoding="%s"?>\n' %
                    self._encoding)

    def endDocument(self):
        self._flush()

    def startPrefixMapping(self, prefix, uri):
        # Save a snapshot of the mappings in force before this one, so
        # that endPrefixMapping() can restore them.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix
        self._undeclared_ns_maps.append((prefix, uri))

    def endPrefixMapping(self, prefix):
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def startElement(self, name, attrs):
        self._write(u'<' + name)
        for (attr_name, value) in attrs.items():
            self._write(u' %s=%s' % (attr_name, quoteattr(value)))
        self._write(u'>')

    def endElement(self, name):
        self._write(u'</%s>' % name)

    def startElementNS(self, name, qname, attrs):
        self._write(u'<' + self._qname(name))

        # Declare the prefix mappings announced since the last start
        # tag.  The URI goes through quoteattr() like any other
        # attribute value, because a URI containing &, < or a quote
        # character would otherwise produce malformed XML.  A uri of
        # None is written as an empty value instead of the text "None".
        for prefix, uri in self._undeclared_ns_maps:
            if prefix:
                self._write(u' xmlns:%s=%s' % (prefix, quoteattr(uri or u'')))
            else:
                self._write(u' xmlns=%s' % quoteattr(uri or u''))
        self._undeclared_ns_maps = []

        for (attr_name, value) in attrs.items():
            self._write(u' %s=%s' % (self._qname(attr_name), quoteattr(value)))
        self._write(u'>')

    def endElementNS(self, name, qname):
        self._write(u'</%s>' % self._qname(name))

    def characters(self, content):
        # Byte strings are decoded with the output encoding first.
        if not isinstance(content, unicode):
            content = unicode(content, self._encoding)
        self._write(escape(content))

    def ignorableWhitespace(self, content):
        # Written through unescaped; byte strings are decoded first.
        if not isinstance(content, unicode):
            content = unicode(content, self._encoding)
        self._write(content)

    def processingInstruction(self, target, data):
        self._write(u'<?%s %s?>' % (target, data))
Fred Drake45cd9de2000-06-29 19:34:54 +0000194
Fred Drakea12adfe2000-09-18 17:40:22 +0000195
class XMLFilterBase(xmlreader.XMLReader):
    """Base class for filters placed between an XMLReader and the
    client application's event handlers.

    Out of the box every configuration request is handed up to the
    parent reader and every event is handed on to the registered
    handlers without modification.  Subclasses override individual
    methods to alter the event stream or the configuration requests
    on their way through."""

    def __init__(self, parent = None):
        xmlreader.XMLReader.__init__(self)
        # The reader this filter takes its events from.
        self._parent = parent

    # ErrorHandler methods -- forwarded to the registered error handler

    def error(self, exception):
        self._err_handler.error(exception)

    def fatalError(self, exception):
        self._err_handler.fatalError(exception)

    def warning(self, exception):
        self._err_handler.warning(exception)

    # ContentHandler methods -- forwarded to the registered content handler

    def setDocumentLocator(self, locator):
        self._cont_handler.setDocumentLocator(locator)

    def startDocument(self):
        self._cont_handler.startDocument()

    def endDocument(self):
        self._cont_handler.endDocument()

    def startPrefixMapping(self, prefix, uri):
        self._cont_handler.startPrefixMapping(prefix, uri)

    def endPrefixMapping(self, prefix):
        self._cont_handler.endPrefixMapping(prefix)

    def startElement(self, name, attrs):
        self._cont_handler.startElement(name, attrs)

    def endElement(self, name):
        self._cont_handler.endElement(name)

    def startElementNS(self, name, qname, attrs):
        self._cont_handler.startElementNS(name, qname, attrs)

    def endElementNS(self, name, qname):
        self._cont_handler.endElementNS(name, qname)

    def characters(self, content):
        self._cont_handler.characters(content)

    def ignorableWhitespace(self, chars):
        self._cont_handler.ignorableWhitespace(chars)

    def processingInstruction(self, target, data):
        self._cont_handler.processingInstruction(target, data)

    def skippedEntity(self, name):
        self._cont_handler.skippedEntity(name)

    # DTDHandler methods -- forwarded to the registered DTD handler

    def notationDecl(self, name, publicId, systemId):
        self._dtd_handler.notationDecl(name, publicId, systemId)

    def unparsedEntityDecl(self, name, publicId, systemId, ndata):
        self._dtd_handler.unparsedEntityDecl(name, publicId, systemId, ndata)

    # EntityResolver methods -- forwarded to the registered entity resolver

    def resolveEntity(self, publicId, systemId):
        return self._ent_handler.resolveEntity(publicId, systemId)

    # XMLReader methods -- forwarded to the parent reader

    def parse(self, source):
        """Register this filter as every handler of the parent reader,
        then let the parent parse source."""
        upstream = self._parent
        upstream.setContentHandler(self)
        upstream.setErrorHandler(self)
        upstream.setEntityResolver(self)
        upstream.setDTDHandler(self)
        upstream.parse(source)

    def setLocale(self, locale):
        self._parent.setLocale(locale)

    def getFeature(self, name):
        return self._parent.getFeature(name)

    def setFeature(self, name, state):
        self._parent.setFeature(name, state)

    def getProperty(self, name):
        return self._parent.getProperty(name)

    def setProperty(self, name, value):
        self._parent.setProperty(name, value)

    # XMLFilter methods

    def getParent(self):
        return self._parent

    def setParent(self, parent):
        self._parent = parent
304
Lars Gustäbel523b0a62000-09-24 18:54:49 +0000305# --- Utility functions
306
def prepare_input_source(source, base = ""):
    """This function takes an InputSource and an optional base URL and
    returns a fully resolved InputSource object ready for reading.

    source may also be a string, which is used as the system identifier
    of a new InputSource, or any object with a read() method, which
    becomes the byte stream of a new InputSource (its name attribute,
    if present, is used as the system identifier).

    When the resulting InputSource has no byte stream, the system
    identifier is first looked up as a file relative to the directory
    part of base; if no such file exists it is joined with base as a
    URL and opened with urllib.urlopen().
    """

    # isinstance() rather than an exact type() comparison, so that
    # subclasses of the string types are accepted as system identifiers.
    if isinstance(source, tuple(_StringTypes)):
        source = xmlreader.InputSource(source)
    elif hasattr(source, "read"):
        f = source
        source = xmlreader.InputSource()
        source.setByteStream(f)
        if hasattr(f, "name"):
            source.setSystemId(f.name)

    if source.getByteStream() is None:
        try:
            sysid = source.getSystemId()
            basehead = os.path.dirname(os.path.normpath(base))
            encoding = sys.getfilesystemencoding()
            # os.path.join() needs both parts to be of the same string
            # kind.  Prefer unicode; if the byte string cannot be
            # decoded with the filesystem encoding, encode the unicode
            # part instead.
            if isinstance(sysid, unicode):
                if not isinstance(basehead, unicode):
                    try:
                        basehead = basehead.decode(encoding)
                    except UnicodeDecodeError:
                        sysid = sysid.encode(encoding)
            else:
                if isinstance(basehead, unicode):
                    try:
                        sysid = sysid.decode(encoding)
                    except UnicodeDecodeError:
                        basehead = basehead.encode(encoding)
            sysidfilename = os.path.join(basehead, sysid)
            isfile = os.path.isfile(sysidfilename)
        except UnicodeError:
            # The name cannot be expressed as a local path; fall back
            # to treating the system identifier as a URL.
            isfile = False
        if isfile:
            source.setSystemId(sysidfilename)
            f = open(sysidfilename, "rb")
        else:
            source.setSystemId(urlparse.urljoin(base, source.getSystemId()))
            f = urllib.urlopen(source.getSystemId())

        source.setByteStream(f)

    return source