blob: 81a36b073e8ec6259dcaf26637bb624c6847b024 [file] [log] [blame]
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
# Event-type tags.  Each queued event is a pair whose first item is one of
# these strings; the names match the SAX callback that produced the event.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
Fred Drake55c38192000-06-29 19:39:57 +000013
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that queues parse events as DOM nodes.

    Events are kept in a singly linked list of two-item cells
    ``[(event_type, node), next_cell]``.  ``firstEvent`` is a sentinel head
    cell (its payload is ``None``) and ``lastEvent`` is the cell new events
    are linked onto.  ``elementStack`` holds the document followed by the
    currently open elements.

    The document is created when the first element is seen (see
    ``buildDocument``); comments and processing instructions that arrive
    earlier are parked in ``pending_events`` until then.
    """

    _locator = None
    document = None
    # Namespace declarations collected since the last start tag, as
    # (local name, uri) pairs; stays None until startPrefixMapping() runs.
    _xmlns_attrs = None

    def __init__(self, documentFactory=None):
        """Set up an empty event queue.

        documentFactory -- object providing ``createDocument(uri, qname,
        doctype)``; when None, ``startDocument`` selects minidom's
        implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        # Bind the list methods directly so push/pop are plain list calls.
        self.push = self.elementStack.append
        self.pop = self.elementStack.pop
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []

    def pop(self):
        """Remove and return the top entry of ``elementStack``.

        Instances set up by ``__init__`` shadow this with the bound
        ``list.pop``; the method stays on the class so the name remains
        available there.
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _link_event(self, event):
        """Link *event*, a ``[payload, next]`` cell, onto the queue tail."""
        self.lastEvent[1] = event
        self.lastEvent = event

    def setDocumentLocator(self, locator):
        """Remember the locator object handed over by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration and open a new prefix scope."""
        if self._xmlns_attrs is None:
            self._xmlns_attrs = []
        # A default-namespace declaration (empty prefix) is stored under
        # the local name 'xmlns'.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save a snapshot of the current scope so endPrefixMapping() can
        # restore it, then extend the live mapping.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the prefix scope saved by the matching start call."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Queue a START_ELEMENT event for a namespace-aware start tag.

        name    -- ``(uri, localname)`` pair
        tagName -- qualified name, or None if the reader did not supply it
        attrs   -- attribute collection keyed by ``(uri, localname)``
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = self._xmlns_attrs
        if xmlns_attrs is not None:
            # Inject the collected declarations so they become attributes
            # of this element; this writes into the reader's attrs object.
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                # Namespace declaration: 'xmlns' or 'xmlns:<prefix>'.
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
            else:
                qname = None

            if qname is None:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            else:
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            attr.value = value

        self._link_event([(START_ELEMENT, node), None])
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue an END_ELEMENT event for the element on top of the stack."""
        self._link_event([(END_ELEMENT, self.pop()), None])

    def startElement(self, name, attrs):
        """Queue a START_ELEMENT event for a non-namespace start tag."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._link_event([(START_ELEMENT, node), None])
        self.push(node)

    def endElement(self, name):
        """Queue an END_ELEMENT event for the element on top of the stack."""
        self._link_event([(END_ELEMENT, self.pop()), None])

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self._link_event([(COMMENT, node), None])
        else:
            # No document to create the node with: keep the raw text and
            # let buildDocument() turn it into a node later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if needed."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self._link_event([(PROCESSING_INSTRUCTION, node), None])
        else:
            # Same deferral as comment(): store the raw target and data.
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a text node."""
        node = self.document.createTextNode(chars)
        self._link_event([(IGNORABLE_WHITESPACE, node), None])

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a text node."""
        node = self.document.createTextNode(chars)
        self._link_event([(CHARACTERS, node), None])

    def startDocument(self):
        """Pick minidom's DOM implementation if no factory was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, flush deferred events, return the root.

        Queues START_DOCUMENT, pushes the document on the stack, converts
        every entry of ``pending_events`` into a real node event, and
        returns the document's first child.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._link_event([(START_DOCUMENT, node), None])
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            kind = e[0][0]
            if kind == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                # The cell is updated in place so its payload becomes the
                # usual (event_type, node) pair.
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif kind == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            self._link_event(e)
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the stack."""
        # Unlike the other callbacks, lastEvent is not advanced here; this
        # keeps the original behaviour.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        """clear(): Explicitly release parsing structures"""
        self.document = None
196
class ErrorHandler:
    """Error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        """Print *exception* and carry on."""
        print(exception)

    def error(self, exception):
        """Raise *exception*."""
        raise exception

    def fatalError(self, exception):
        """Raise *exception*."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000204
class DOMEventStream:
    """Iterator yielding ``(event_type, node)`` pairs read from a parser.

    stream  -- object with ``read(size)`` supplying the XML text
    parser  -- SAX parser; if it has no ``feed`` method the whole stream is
               parsed up front (see ``_slurp``)
    bufsize -- number of characters/bytes requested per ``read`` call
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Parser cannot be fed incrementally: switch to the fallback
            # that parses everything on the first request.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event; *pos* is ignored.

        Raises IndexError once no more events are available.
        """
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def __next__(self):
        """Return the next event or raise StopIteration when exhausted."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events until *node* ends, building its subtree.

        Every node delivered in the meantime is appended to the innermost
        open element, starting with *node* itself.  Returns when an event
        carrying *node* is seen or the events run out.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next queued event, feeding the parser as needed.

        Returns None (after closing the parser) when the stream is
        exhausted and no event is left.
        """
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: point the tail back at the head so newly
            # parsed events become reachable from firstEvent.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once the queue is drained, so iteration stops
            cleanly instead of failing on the empty queue.
        """
        head = self.pulldom.firstEvent
        cell = head[1]
        if not cell:
            return None
        head[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
289
class SAX2DOM(PullDOM):
    """PullDOM variant that also attaches each new node to its parent.

    Besides queueing events, elements are appended to the entry below
    them on ``elementStack``, and text / processing-instruction nodes to
    the entry on top of it.
    """

    def buildDocument(self, uri, tagname):
        """Build the document and attach processing instructions seen
        before the root element.

        Those instructions are deferred by PullDOM because no document
        existed to create them with; once it exists they are inserted in
        front of the root element.
        """
        # Keep a reference: the base method converts these cells in place
        # and then drops its own reference to the list.
        deferred = self.pending_events or ()
        root = PullDOM.buildDocument(self, uri, tagname)
        for event in deferred:
            kind, node = event[0]
            if kind == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def _attach_top_element(self):
        """Append the element on top of the stack to the entry below it."""
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def _attach_last_node(self):
        """Append the node of the newest event to the top stack entry."""
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def startElementNS(self, name, tagName, attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        self._attach_top_element()

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        self._attach_top_element()

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # Deferred by PullDOM: there is no node and no parent yet.
            # buildDocument() attaches it once the document exists.
            return
        self._attach_last_node()

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        self._attach_last_node()

    def characters(self, chars):
        PullDOM.characters(self, chars)
        self._attach_last_node()
Fred Drake16f63292000-10-23 18:09:50 +0000321
Fred Drakec16adce2000-12-14 18:00:18 +0000322
# Read size used by parse() when the caller gives no bufsize.
default_bufsize = (2 ** 14) - 20
324
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream reading from a file name or open stream.

    stream_or_string -- path of the file to open, or an object with a
                        ``read(size)`` method
    parser           -- SAX parser to use; one is created via
                        ``xml.sax.make_parser()`` when omitted
    bufsize          -- read size per chunk; defaults to ``default_bufsize``
    """
    if bufsize is None:
        bufsize = default_bufsize
    if isinstance(stream_or_string, str):
        # Open in binary mode so the parser sees the raw bytes and can
        # apply the document's own encoding declaration.  The file is not
        # closed here; it is handed to the returned stream object.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000335
def parseString(string, parser=None):
    """Return a DOMEventStream reading from an in-memory document.

    string -- the XML document as ``str``, ``bytes`` or ``bytearray``
    parser -- SAX parser to use; one is created via
              ``xml.sax.make_parser()`` when omitted

    The whole document is offered to the parser in a single read, because
    the buffer size is set to the length of *string*.
    """
    from io import BytesIO, StringIO

    bufsize = len(string)
    if isinstance(string, (bytes, bytearray)):
        buf = BytesIO(string)
    else:
        buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)
Fred Drake1f549022000-09-24 05:21:58 +0000346 return DOMEventStream(buf, parser, bufsize)