blob: 1b89e31aed8f3a6d74a11e25c6ddf59d65704703 [file] [log] [blame]
"""\
A library of useful helper classes to the SAX classes, for the
convenience of application and driver writers.
"""
5
Martin v. Löwis58af43f2000-09-24 21:31:06 +00006import os, urlparse, urllib, types
Serhiy Storchakaf8980382013-02-10 14:26:08 +02007import io
Serhiy Storchaka8673ab92013-02-02 10:28:30 +02008import sys
Fred Drake45cd9de2000-06-29 19:34:54 +00009import handler
Lars Gustäbelfc643c32000-09-24 10:53:31 +000010import xmlreader
Fred Drakea12adfe2000-09-18 17:40:22 +000011
# Types that prepare_input_source() treats as "a system identifier given
# as a string".  UnicodeType is added only when the types module has it.
_StringTypes = [types.StringType]
try:
    _StringTypes.append(types.UnicodeType)
except AttributeError:
    pass
16
def __dict_replace(s, d):
    """Return *s* with every key of *d* replaced by its mapped value.

    The replacements are applied one after another, in the order the
    dictionary yields its items.
    """
    result = s
    for pair in d.items():
        result = result.replace(*pair)
    return result
Martin v. Löwis58af43f2000-09-24 21:31:06 +000022
def escape(data, entities={}):
    """Return *data* with &, < and > replaced by XML entity references.

    Additional replacements can be requested through the optional
    *entities* dictionary: every key found in the data is replaced by
    its value.  Keys and values must all be strings.
    """
    # The ampersand has to be handled first, otherwise the ampersands
    # introduced by the other references would be escaped again.
    for raw, ref in (("&", "&amp;"), (">", "&gt;"), ("<", "&lt;")):
        data = data.replace(raw, ref)
    return __dict_replace(data, entities) if entities else data
Martin v. Löwis74b51ac2002-10-26 14:50:45 +000038
def unescape(data, entities={}):
    """Return *data* with &amp;, &lt; and &gt; turned back into characters.

    Additional replacements can be requested through the optional
    *entities* dictionary: every key found in the data is replaced by
    its value.  Keys and values must all be strings.
    """
    for ref, raw in (("&lt;", "<"), ("&gt;", ">")):
        data = data.replace(ref, raw)
    if entities:
        data = __dict_replace(data, entities)
    # The ampersand has to be handled last, so that text such as
    # "&amp;lt;" comes out as "&lt;" rather than "<".
    return data.replace("&amp;", "&")
Fred Drake45cd9de2000-06-29 19:34:54 +000052
def quoteattr(data, entities={}):
    """Escape *data* and wrap it in quotes for use as an attribute value.

    &, < and > are escaped, as are newline, carriage return and tab
    (written as numeric character references).  The result is enclosed
    in double quotes unless the value contains a double quote; in that
    case single quotes are used, or, when both kinds of quote occur,
    the double quotes are escaped as &quot;.

    Additional replacements can be requested through the optional
    *entities* dictionary: every key found in the data is replaced by
    its value.  Keys and values must all be strings.
    """
    whitespace_refs = {'\n': '&#10;', '\r': '&#13;', '\t': '&#9;'}
    # Work on a copy so the caller's dictionary is left untouched.
    merged = entities.copy()
    merged.update(whitespace_refs)
    escaped = escape(data, merged)

    if '"' not in escaped:
        return '"%s"' % escaped
    if "'" not in escaped:
        return "'%s'" % escaped
    return '"%s"' % escaped.replace('"', "&quot;")
75
Fred Drakea12adfe2000-09-18 17:40:22 +000076
def _gettextwriter(out, encoding):
    """Return a text writer that encodes to *encoding* and writes to *out*.

    *out* may be None, in which case sys.stdout is used, or any object
    with a write() method.  The returned _UnbufferedTextIOWrapper uses
    the 'xmlcharrefreplace' error handler and writes '\\n' newlines
    untranslated.
    """
    if out is None:
        import sys
        out = sys.stdout

    if not isinstance(out, io.RawIOBase):
        # Anything that is not a raw stream is only assumed to have a
        # write method, so expose it through a bare buffered object.
        binary = io.BufferedIOBase()
        binary.writable = lambda: True
        binary.write = out.write
        try:
            # TextIOWrapper consults these methods to decide whether a
            # BOM (for UTF-16, etc.) should be emitted.
            binary.seekable = out.seekable
            binary.tell = out.tell
        except AttributeError:
            pass
    else:
        # NOTE(review): io.BufferedIOBase is the abstract base class;
        # confirm that constructing it with `out` really forwards
        # writes to the raw stream.
        binary = io.BufferedIOBase(out)
        # Keep the original file open when the TextIOWrapper is
        # destroyed.
        binary.close = lambda: None

    # Wrap the binary writer in a text layer.
    return _UnbufferedTextIOWrapper(binary, encoding=encoding,
                                    errors='xmlcharrefreplace',
                                    newline='\n')
Serhiy Storchakaf8980382013-02-10 14:26:08 +0200104
Raymond Hettinger1b5f58d2014-07-25 10:26:36 -0700105
class _UnbufferedTextIOWrapper(io.TextIOWrapper):
    """TextIOWrapper that flushes itself after every write() call."""

    def write(self, s):
        """Write *s* through the base class, then flush immediately."""
        io.TextIOWrapper.write(self, s)
        self.flush()
110
111
class XMLGenerator(handler.ContentHandler):
    """ContentHandler that writes the SAX events it receives out as XML.

    out      -- object to write to; it is handed to _gettextwriter(),
                which falls back to sys.stdout when it is None.
    encoding -- name of the output encoding; it is also the encoding
                announced in the XML declaration and the one used to
                decode non-unicode character data.
    """

    def __init__(self, out=None, encoding="iso-8859-1"):
        handler.ContentHandler.__init__(self)
        out = _gettextwriter(out, encoding)
        self._write = out.write
        self._flush = out.flush
        self._ns_contexts = [{}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # (prefix, uri) pairs announced by startPrefixMapping() that
        # have not yet been written as xmlns attributes.
        self._undeclared_ns_maps = []
        self._encoding = encoding

    def _qname(self, name):
        """Builds a qualified name from a (ns_url, localname) pair.

        Raises KeyError if the namespace is not in the current context.
        """
        if name[0]:
            # Per http://www.w3.org/XML/1998/namespace, The 'xml' prefix is
            # bound by definition to http://www.w3.org/XML/1998/namespace.  It
            # does not need to be declared and will not usually be found in
            # self._current_context.
            if 'http://www.w3.org/XML/1998/namespace' == name[0]:
                return 'xml:' + name[1]
            # The name is in a non-empty namespace
            prefix = self._current_context[name[0]]
            if prefix:
                # If it is not the default namespace, prepend the prefix
                return prefix + ":" + name[1]
        # Return the unqualified name
        return name[1]

    # ContentHandler methods

    def startDocument(self):
        """Write the XML declaration, including the output encoding."""
        self._write(u'<?xml version="1.0" encoding="%s"?>\n' %
                    self._encoding)

    def endDocument(self):
        """Flush the underlying writer."""
        self._flush()

    def startPrefixMapping(self, prefix, uri):
        """Bind *uri* to *prefix* and queue its xmlns declaration."""
        # Save a snapshot of the context as it was before this mapping;
        # endPrefixMapping() restores it.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix
        self._undeclared_ns_maps.append((prefix, uri))

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def startElement(self, name, attrs):
        """Write the start tag for *name* with the attributes in *attrs*."""
        self._write(u'<' + name)
        for attr_name, value in attrs.items():
            self._write(u' %s=%s' % (attr_name, quoteattr(value)))
        self._write(u'>')

    def endElement(self, name):
        """Write the end tag for *name*."""
        self._write(u'</%s>' % name)

    def startElementNS(self, name, qname, attrs):
        """Write a namespace-aware start tag.

        *name* is a (uri, localname) pair; *qname* is not used.  Any
        prefix mappings announced since the previous start tag are
        written as xmlns attributes first.
        """
        self._write(u'<' + self._qname(name))

        for prefix, uri in self._undeclared_ns_maps:
            # The URI goes through quoteattr() like any other attribute
            # value, so a quote, ampersand or '<' in it cannot produce
            # malformed output.
            if prefix:
                self._write(u' xmlns:%s=%s' % (prefix, quoteattr(uri)))
            else:
                self._write(u' xmlns=%s' % quoteattr(uri))
        self._undeclared_ns_maps = []

        for attr_name, value in attrs.items():
            self._write(u' %s=%s' % (self._qname(attr_name),
                                     quoteattr(value)))
        self._write(u'>')

    def endElementNS(self, name, qname):
        """Write the end tag for the (uri, localname) pair *name*."""
        self._write(u'</%s>' % self._qname(name))

    def characters(self, content):
        """Write *content* with &, < and > escaped."""
        if not isinstance(content, unicode):
            content = unicode(content, self._encoding)
        self._write(escape(content))

    def ignorableWhitespace(self, content):
        """Write *content* unchanged (no escaping is applied)."""
        if not isinstance(content, unicode):
            content = unicode(content, self._encoding)
        self._write(content)

    def processingInstruction(self, target, data):
        """Write a processing instruction of the form <?target data?>."""
        self._write(u'<?%s %s?>' % (target, data))
Fred Drake45cd9de2000-06-29 19:34:54 +0000197
Fred Drakea12adfe2000-09-18 17:40:22 +0000198
class XMLFilterBase(xmlreader.XMLReader):
    """Pass-through filter placed between an XMLReader and the handlers.

    Configuration requests (features, properties, locale, parse) are
    forwarded to the parent reader, and every event coming back is
    forwarded unchanged to the handlers registered on this object.
    Subclasses override individual methods to alter the event stream
    or the configuration requests on their way through.
    """

    def __init__(self, parent=None):
        xmlreader.XMLReader.__init__(self)
        self._parent = parent

    # ErrorHandler methods

    def error(self, exception):
        """Forward a recoverable error to the error handler."""
        self._err_handler.error(exception)

    def fatalError(self, exception):
        """Forward a fatal error to the error handler."""
        self._err_handler.fatalError(exception)

    def warning(self, exception):
        """Forward a warning to the error handler."""
        self._err_handler.warning(exception)

    # ContentHandler methods

    def setDocumentLocator(self, locator):
        """Forward the document locator to the content handler."""
        self._cont_handler.setDocumentLocator(locator)

    def startDocument(self):
        """Forward the start-of-document event."""
        self._cont_handler.startDocument()

    def endDocument(self):
        """Forward the end-of-document event."""
        self._cont_handler.endDocument()

    def startPrefixMapping(self, prefix, uri):
        """Forward the start of a prefix mapping."""
        self._cont_handler.startPrefixMapping(prefix, uri)

    def endPrefixMapping(self, prefix):
        """Forward the end of a prefix mapping."""
        self._cont_handler.endPrefixMapping(prefix)

    def startElement(self, name, attrs):
        """Forward a start-element event."""
        self._cont_handler.startElement(name, attrs)

    def endElement(self, name):
        """Forward an end-element event."""
        self._cont_handler.endElement(name)

    def startElementNS(self, name, qname, attrs):
        """Forward a namespace-aware start-element event."""
        self._cont_handler.startElementNS(name, qname, attrs)

    def endElementNS(self, name, qname):
        """Forward a namespace-aware end-element event."""
        self._cont_handler.endElementNS(name, qname)

    def characters(self, content):
        """Forward character data."""
        self._cont_handler.characters(content)

    def ignorableWhitespace(self, chars):
        """Forward ignorable whitespace."""
        self._cont_handler.ignorableWhitespace(chars)

    def processingInstruction(self, target, data):
        """Forward a processing instruction."""
        self._cont_handler.processingInstruction(target, data)

    def skippedEntity(self, name):
        """Forward a skipped-entity notification."""
        self._cont_handler.skippedEntity(name)

    # DTDHandler methods

    def notationDecl(self, name, publicId, systemId):
        """Forward a notation declaration to the DTD handler."""
        self._dtd_handler.notationDecl(name, publicId, systemId)

    def unparsedEntityDecl(self, name, publicId, systemId, ndata):
        """Forward an unparsed entity declaration to the DTD handler."""
        self._dtd_handler.unparsedEntityDecl(name, publicId, systemId, ndata)

    # EntityResolver methods

    def resolveEntity(self, publicId, systemId):
        """Return whatever the entity resolver returns for this entity."""
        return self._ent_handler.resolveEntity(publicId, systemId)

    # XMLReader methods

    def parse(self, source):
        """Register this filter as every handler of the parent, then parse."""
        self._parent.setContentHandler(self)
        self._parent.setErrorHandler(self)
        self._parent.setEntityResolver(self)
        self._parent.setDTDHandler(self)
        self._parent.parse(source)

    def setLocale(self, locale):
        """Forward the locale setting to the parent reader."""
        self._parent.setLocale(locale)

    def getFeature(self, name):
        """Return the parent reader's value for feature *name*."""
        return self._parent.getFeature(name)

    def setFeature(self, name, state):
        """Set feature *name* on the parent reader."""
        self._parent.setFeature(name, state)

    def getProperty(self, name):
        """Return the parent reader's value for property *name*."""
        return self._parent.getProperty(name)

    def setProperty(self, name, value):
        """Set property *name* on the parent reader."""
        self._parent.setProperty(name, value)

    # XMLFilter methods

    def getParent(self):
        """Return the parent reader."""
        return self._parent

    def setParent(self, parent):
        """Replace the parent reader."""
        self._parent = parent
307
# --- Utility functions
309
def prepare_input_source(source, base=""):
    """Return an InputSource for *source* that has a byte stream attached.

    *source* may be a string (used as the system identifier), an object
    with a read() method (used as the byte stream; its name attribute,
    if any, becomes the system identifier), or an InputSource.  When no
    byte stream is set yet, the system identifier is resolved against
    *base*: it is opened as a local file if such a file exists,
    otherwise as a URL.
    """
    if type(source) in _StringTypes:
        source = xmlreader.InputSource(source)
    elif hasattr(source, "read"):
        stream = source
        source = xmlreader.InputSource()
        source.setByteStream(stream)
        if hasattr(stream, "name"):
            source.setSystemId(stream.name)

    # Nothing left to do when the source already has a byte stream.
    if source.getByteStream() is not None:
        return source

    try:
        sysid = source.getSystemId()
        basehead = os.path.dirname(os.path.normpath(base))
        encoding = sys.getfilesystemencoding()
        sysid_is_unicode = isinstance(sysid, unicode)
        base_is_unicode = isinstance(basehead, unicode)
        # Bring both path components to the same string type before
        # joining them: prefer unicode, and fall back to encoding the
        # unicode one when the other cannot be decoded.
        if sysid_is_unicode and not base_is_unicode:
            try:
                basehead = basehead.decode(encoding)
            except UnicodeDecodeError:
                sysid = sysid.encode(encoding)
        elif base_is_unicode and not sysid_is_unicode:
            try:
                sysid = sysid.decode(encoding)
            except UnicodeDecodeError:
                basehead = basehead.encode(encoding)
        sysidfilename = os.path.join(basehead, sysid)
        isfile = os.path.isfile(sysidfilename)
    except UnicodeError:
        # A name that cannot be converted is not treated as a local file.
        isfile = False

    if isfile:
        source.setSystemId(sysidfilename)
        stream = open(sysidfilename, "rb")
    else:
        source.setSystemId(urlparse.urljoin(base, source.getSystemId()))
        stream = urllib.urlopen(source.getSystemId())

    source.setByteStream(stream)
    return source
Lars Gustäbel523b0a62000-09-24 18:54:49 +0000353 return source