blob: c1612ea1cebc5d064280e7d2d617d0fc2cf1f0f0 [file] [log] [blame]
"""\
A library of useful helper classes to the SAX classes, for the
convenience of application and driver writers.
"""
5
Jeremy Hylton1afc1692008-06-18 20:49:58 +00006import os, urllib.parse, urllib.request
Serhiy Storchaka88efc522013-02-10 14:29:52 +02007import io
Georg Brandlc502df42013-05-12 11:41:12 +02008import codecs
Guido van Rossum3b271052006-08-17 09:10:09 +00009from . import handler
10from . import xmlreader
Fred Drakea12adfe2000-09-18 17:40:22 +000011
def __dict_replace(s, d):
    """Return *s* with each key of *d* replaced by its mapped value.

    The replacements are applied one after another in the mapping's
    iteration order, so a later replacement operates on the output of
    the earlier ones.
    """
    result = s
    for old in d:
        result = result.replace(old, d[old])
    return result
Martin v. Löwis58af43f2000-09-24 21:31:06 +000017
def escape(data, entities={}):
    """Escape &, <, and > in a string of data.

    Further replacements can be requested through the optional
    entities dictionary.  Its keys and values must all be strings;
    every occurrence of a key is replaced with the matching value
    after the three built-in replacements have been made.
    """
    # The ampersand has to be handled first: the other replacements
    # introduce '&' characters that must not be escaped again.
    escaped = (data.replace("&", "&amp;")
                   .replace(">", "&gt;")
                   .replace("<", "&lt;"))
    if not entities:
        return escaped
    return __dict_replace(escaped, entities)
Martin v. Löwis74b51ac2002-10-26 14:50:45 +000033
def unescape(data, entities={}):
    """Unescape &amp;, &lt;, and &gt; in a string of data.

    Further replacements can be requested through the optional
    entities dictionary.  Its keys and values must all be strings;
    every occurrence of a key is replaced with the matching value.
    """
    text = data.replace("&lt;", "<").replace("&gt;", ">")
    if entities:
        text = __dict_replace(text, entities)
    # The ampersand has to be handled last, so that the '&' it yields
    # is not picked up as the start of another reference above.
    text = text.replace("&amp;", "&")
    return text
Fred Drake45cd9de2000-06-29 19:34:54 +000047
def quoteattr(data, entities={}):
    """Escape and quote an attribute value.

    &, <, and > in the data are escaped, and the result is wrapped in
    quotes so it can be used as an attribute value.  Double quotes are
    preferred; single quotes are used when the value contains a double
    quote but no single quote, and when it contains both, the double
    quotes are written as &quot;.

    Further replacements can be requested through the optional
    entities dictionary.  Its keys and values must all be strings;
    every occurrence of a key is replaced with the matching value.
    """
    # Newline, carriage return and tab are always written as character
    # references; these entries win over same-keyed caller entries.
    replacements = dict(entities)
    replacements.update({'\n': '&#10;', '\r': '&#13;', '\t':'&#9;'})
    escaped = escape(data, replacements)

    if '"' not in escaped:
        return '"%s"' % escaped
    if "'" not in escaped:
        return "'%s'" % escaped
    return '"%s"' % escaped.replace('"', "&quot;")
69
Fred Drakea12adfe2000-09-18 17:40:22 +000070
def _gettextwriter(out, encoding):
    """Return an object with a text-mode write() for the target *out*.

    None selects sys.stdout.  Text streams and codecs stream writers
    are returned unchanged.  Anything else is treated as a binary
    writer and wrapped in an io.TextIOWrapper that encodes with
    *encoding*, replaces unencodable characters with XML character
    references, and writes through without buffering text.
    """
    if out is None:
        import sys
        return sys.stdout

    # Text writers and codecs stream writers already accept str, so
    # they are used as is.
    if isinstance(out, (io.TextIOBase,
                        codecs.StreamWriter, codecs.StreamReaderWriter)):
        return out

    # Everything below wraps a binary writer with TextIOWrapper.
    if isinstance(out, io.RawIOBase):
        # Proxy that forwards attribute access to *out* but has a no-op
        # close(), so the original file stays open when the
        # TextIOWrapper is destroyed.
        class _wrapper:
            __class__ = out.__class__

            def __getattr__(self, name):
                return getattr(out, name)

        binary_target = _wrapper()
        binary_target.close = lambda: None
    else:
        # Handles any other object that provides a write method,
        # including ones outside the IOBase hierarchy.
        binary_target = io.BufferedIOBase()
        binary_target.writable = lambda: True
        binary_target.write = out.write
        try:
            # TextIOWrapper uses these methods to determine
            # if BOM (for UTF-16, etc) should be added
            binary_target.seekable = out.seekable
            binary_target.tell = out.tell
        except AttributeError:
            pass

    return io.TextIOWrapper(binary_target, encoding=encoding,
                            errors='xmlcharrefreplace',
                            newline='\n',
                            write_through=True)
111
class XMLGenerator(handler.ContentHandler):
    """Content handler that writes the SAX events it receives as XML text."""

    def __init__(self, out=None, encoding="iso-8859-1", short_empty_elements=False):
        """Set up the generator.

        out -- target of the output; it is passed through
            _gettextwriter() to obtain an object with write() and
            flush() methods.
        encoding -- encoding name handed to _gettextwriter() and
            written into the XML declaration by startDocument().
        short_empty_elements -- when true, an element that receives no
            content before its end event is written as <name/> instead
            of <name></name>.
        """
        handler.ContentHandler.__init__(self)
        out = _gettextwriter(out, encoding)
        self._write = out.write
        self._flush = out.flush
        self._ns_contexts = [{}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        # (prefix, uri) pairs announced by startPrefixMapping() that
        # still have to be declared on the next start tag.
        self._undeclared_ns_maps = []
        self._encoding = encoding
        self._short_empty_elements = short_empty_elements
        # True while a start tag has been written without its closing
        # '>' (only happens with short_empty_elements).
        self._pending_start_element = False

    def _qname(self, name):
        """Builds a qualified name from a (ns_url, localname) pair"""
        if name[0]:
            # Per http://www.w3.org/XML/1998/namespace, The 'xml' prefix is
            # bound by definition to http://www.w3.org/XML/1998/namespace.  It
            # does not need to be declared and will not usually be found in
            # self._current_context.
            if 'http://www.w3.org/XML/1998/namespace' == name[0]:
                return 'xml:' + name[1]
            # The name is in a non-empty namespace
            prefix = self._current_context[name[0]]
            if prefix:
                # If it is not the default namespace, prepend the prefix
                return prefix + ":" + name[1]
        # Return the unqualified name
        return name[1]

    def _finish_pending_start_element(self,endElement=False):
        """Write the '>' of a start tag that was left open, if any.

        The endElement parameter is not used by this implementation.
        """
        if self._pending_start_element:
            self._write('>')
            self._pending_start_element = False

    # ContentHandler methods

    def startDocument(self):
        """Write the XML declaration, naming the configured encoding."""
        self._write('<?xml version="1.0" encoding="%s"?>\n' %
                        self._encoding)

    def endDocument(self):
        """Flush the underlying writer."""
        self._flush()

    def startPrefixMapping(self, prefix, uri):
        """Record a prefix mapping and queue it for declaration."""
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix
        self._undeclared_ns_maps.append((prefix, uri))

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts[-1]
        del self._ns_contexts[-1]

    def startElement(self, name, attrs):
        """Write the start tag of a non-namespace element."""
        self._finish_pending_start_element()
        self._write('<' + name)
        for (attr_name, value) in attrs.items():
            self._write(' %s=%s' % (attr_name, quoteattr(value)))
        if self._short_empty_elements:
            # Defer the '>' so endElement() can emit '/>' instead.
            self._pending_start_element = True
        else:
            self._write(">")

    def endElement(self, name):
        """Write the end tag, or '/>' if the start tag is still open."""
        if self._pending_start_element:
            self._write('/>')
            self._pending_start_element = False
        else:
            self._write('</%s>' % name)

    def startElementNS(self, name, qname, attrs):
        """Write the start tag of a namespace-aware element.

        name is a (uri, localname) pair; the qname argument is ignored
        and the qualified name is rebuilt from the current namespace
        context.  Prefix mappings announced since the previous start
        tag are declared on this element.
        """
        self._finish_pending_start_element()
        self._write('<' + self._qname(name))

        for prefix, uri in self._undeclared_ns_maps:
            # The URI is escaped and quoted like any other attribute
            # value, so markup characters in it (&, <, ") cannot
            # produce malformed output.
            if prefix:
                self._write(' xmlns:%s=%s' % (prefix, quoteattr(uri)))
            else:
                self._write(' xmlns=%s' % quoteattr(uri))
        self._undeclared_ns_maps = []

        for (attr_name, value) in attrs.items():
            self._write(' %s=%s' % (self._qname(attr_name), quoteattr(value)))
        if self._short_empty_elements:
            # Defer the '>' so endElementNS() can emit '/>' instead.
            self._pending_start_element = True
        else:
            self._write(">")

    def endElementNS(self, name, qname):
        """Write the end tag, or '/>' if the start tag is still open."""
        if self._pending_start_element:
            self._write('/>')
            self._pending_start_element = False
        else:
            self._write('</%s>' % self._qname(name))

    def characters(self, content):
        """Write character data with &, < and > escaped.

        Empty content is ignored; non-str content is decoded with the
        configured encoding first.
        """
        if content:
            self._finish_pending_start_element()
            if not isinstance(content, str):
                content = str(content, self._encoding)
            self._write(escape(content))

    def ignorableWhitespace(self, content):
        """Write whitespace as is, without escaping.

        Empty content is ignored; non-str content is decoded with the
        configured encoding first.
        """
        if content:
            self._finish_pending_start_element()
            if not isinstance(content, str):
                content = str(content, self._encoding)
            self._write(content)

    def processingInstruction(self, target, data):
        """Write a processing instruction; target and data are not escaped."""
        self._finish_pending_start_element()
        self._write('<?%s %s?>' % (target, data))
Fred Drake45cd9de2000-06-29 19:34:54 +0000225
Fred Drakea12adfe2000-09-18 17:40:22 +0000226
class XMLFilterBase(xmlreader.XMLReader):
    """This class is designed to sit between an XMLReader and the
    client application's event handlers.  By default, it does nothing
    but pass requests up to the reader and events on to the handlers
    unmodified, but subclasses can override specific methods to modify
    the event stream or the configuration requests as they pass
    through."""

    def __init__(self, parent = None):
        """Create a filter on top of the reader *parent* (may be None)."""
        xmlreader.XMLReader.__init__(self)
        self._parent = parent

    # ErrorHandler methods -- forwarded to self._err_handler

    def error(self, exception):
        """Pass an error event on to the error handler."""
        err_handler = self._err_handler
        err_handler.error(exception)

    def fatalError(self, exception):
        """Pass a fatalError event on to the error handler."""
        err_handler = self._err_handler
        err_handler.fatalError(exception)

    def warning(self, exception):
        """Pass a warning event on to the error handler."""
        err_handler = self._err_handler
        err_handler.warning(exception)

    # ContentHandler methods -- forwarded to self._cont_handler

    def setDocumentLocator(self, locator):
        """Pass the document locator on to the content handler."""
        content = self._cont_handler
        content.setDocumentLocator(locator)

    def startDocument(self):
        """Pass a startDocument event on to the content handler."""
        content = self._cont_handler
        content.startDocument()

    def endDocument(self):
        """Pass an endDocument event on to the content handler."""
        content = self._cont_handler
        content.endDocument()

    def startPrefixMapping(self, prefix, uri):
        """Pass a startPrefixMapping event on to the content handler."""
        content = self._cont_handler
        content.startPrefixMapping(prefix, uri)

    def endPrefixMapping(self, prefix):
        """Pass an endPrefixMapping event on to the content handler."""
        content = self._cont_handler
        content.endPrefixMapping(prefix)

    def startElement(self, name, attrs):
        """Pass a startElement event on to the content handler."""
        content = self._cont_handler
        content.startElement(name, attrs)

    def endElement(self, name):
        """Pass an endElement event on to the content handler."""
        content = self._cont_handler
        content.endElement(name)

    def startElementNS(self, name, qname, attrs):
        """Pass a startElementNS event on to the content handler."""
        content = self._cont_handler
        content.startElementNS(name, qname, attrs)

    def endElementNS(self, name, qname):
        """Pass an endElementNS event on to the content handler."""
        content = self._cont_handler
        content.endElementNS(name, qname)

    def characters(self, content):
        """Pass a characters event on to the content handler."""
        target = self._cont_handler
        target.characters(content)

    def ignorableWhitespace(self, chars):
        """Pass an ignorableWhitespace event on to the content handler."""
        target = self._cont_handler
        target.ignorableWhitespace(chars)

    def processingInstruction(self, target, data):
        """Pass a processingInstruction event on to the content handler."""
        content = self._cont_handler
        content.processingInstruction(target, data)

    def skippedEntity(self, name):
        """Pass a skippedEntity event on to the content handler."""
        content = self._cont_handler
        content.skippedEntity(name)

    # DTDHandler methods -- forwarded to self._dtd_handler

    def notationDecl(self, name, publicId, systemId):
        """Pass a notationDecl event on to the DTD handler."""
        dtd = self._dtd_handler
        dtd.notationDecl(name, publicId, systemId)

    def unparsedEntityDecl(self, name, publicId, systemId, ndata):
        """Pass an unparsedEntityDecl event on to the DTD handler."""
        dtd = self._dtd_handler
        dtd.unparsedEntityDecl(name, publicId, systemId, ndata)

    # EntityResolver methods -- forwarded to self._ent_handler

    def resolveEntity(self, publicId, systemId):
        """Return whatever the entity resolver returns for the entity."""
        resolver = self._ent_handler
        return resolver.resolveEntity(publicId, systemId)

    # XMLReader methods -- forwarded to the parent reader

    def parse(self, source):
        """Install this filter as every handler of the parent, then
        have the parent parse *source*."""
        parent = self._parent
        parent.setContentHandler(self)
        parent.setErrorHandler(self)
        parent.setEntityResolver(self)
        parent.setDTDHandler(self)
        parent.parse(source)

    def setLocale(self, locale):
        """Pass the locale setting on to the parent reader."""
        parent = self._parent
        parent.setLocale(locale)

    def getFeature(self, name):
        """Return the parent reader's state for feature *name*."""
        parent = self._parent
        return parent.getFeature(name)

    def setFeature(self, name, state):
        """Set feature *name* to *state* on the parent reader."""
        parent = self._parent
        parent.setFeature(name, state)

    def getProperty(self, name):
        """Return the parent reader's value for property *name*."""
        parent = self._parent
        return parent.getProperty(name)

    def setProperty(self, name, value):
        """Set property *name* to *value* on the parent reader."""
        parent = self._parent
        parent.setProperty(name, value)

    # XMLFilter methods

    def getParent(self):
        """Return the parent reader."""
        return self._parent

    def setParent(self, parent):
        """Replace the parent reader."""
        self._parent = parent
335
Lars Gustäbel523b0a62000-09-24 18:54:49 +0000336# --- Utility functions
337
def prepare_input_source(source, base=""):
    """Turn *source* into an InputSource that has a stream to read from.

    source may be an os.PathLike object or a string (used as the
    system id), an object with a read() method, or an InputSource.
    base is an optional base URL used to resolve the system id when a
    stream still has to be opened.  The resolved InputSource is
    returned.
    """
    if isinstance(source, os.PathLike):
        source = os.fspath(source)

    if isinstance(source, str):
        source = xmlreader.InputSource(source)
    elif hasattr(source, "read"):
        stream = source
        source = xmlreader.InputSource()
        # A zero-length read shows whether the stream yields str or
        # bytes.
        if isinstance(stream.read(0), str):
            source.setCharacterStream(stream)
        else:
            source.setByteStream(stream)
        stream_name = getattr(stream, "name", None)
        if isinstance(stream_name, str):
            source.setSystemId(stream_name)

    # Nothing left to do when a stream is already attached.
    if (source.getCharacterStream() is not None
            or source.getByteStream() is not None):
        return source

    system_id = source.getSystemId()
    base_dir = os.path.dirname(os.path.normpath(base))
    local_path = os.path.join(base_dir, system_id)
    if os.path.isfile(local_path):
        # The system id names an existing local file: read it directly.
        source.setSystemId(local_path)
        stream = open(local_path, "rb")
    else:
        # Otherwise treat it as a URL, resolved against the base.
        source.setSystemId(urllib.parse.urljoin(base, system_id))
        stream = urllib.request.urlopen(source.getSystemId())

    source.setByteStream(stream)
    return source