blob: 43504f76561652f3384b8b4ba62c12aea1fc4122 [file] [log] [blame]
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003
# Event-type tags.  PullDOM queues events as ``(tag, node)`` pairs and
# consumers compare the first item against these constants.

# Document boundaries.
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"

# Element boundaries.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Leaf content.
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
COMMENT = "COMMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
Fred Drake55c38192000-06-29 19:39:57 +000012
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records parser callbacks as DOM events.

    Events are stored in a singly linked list of two-item cells
    ``[payload, next_cell]``.  ``firstEvent`` is a sentinel head cell and
    ``lastEvent`` is the current tail, so a consumer can unlink cells from
    the head while the handler keeps appending at the tail.  A payload is
    an ``(event_type, node)`` tuple.

    The document object is only created when the first element arrives
    (see buildDocument); comments and processing instructions seen before
    that are parked in ``pending_events``.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up an empty event queue, element stack and namespace context.

        documentFactory -- object providing ``createDocument(uri, name,
        doctype)``; when None, startDocument() falls back to minidom's
        implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        head = [None, None]
        self.firstEvent = head
        self.lastEvent = head
        stack = []
        self.elementStack = stack
        # Bound list methods stored on the instance; this ``pop`` shadows
        # the pop() method defined on the class.
        self.push = stack.append
        self.pop = stack.pop
        base_context = {XML_NAMESPACE: 'xml'}
        self._ns_contexts = [base_context]  # stack of uri -> prefix dicts
        self._current_context = base_context
        self.pending_events = []

    def _enqueue(self, payload):
        """Append a new cell holding *payload* and make it the tail."""
        cell = [payload, None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def pop(self):
        """Remove and return the top of the element stack."""
        top = self.elementStack[-1]
        del self.elementStack[-1]
        return top

    def setDocumentLocator(self, locator):
        """Remember the locator object supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration that is coming into scope.

        The declaration is remembered so startElementNS() can turn it into
        an xmlns attribute, and the uri -> prefix context is updated after
        a copy of the previous one has been saved.
        """
        try:
            declared = self._xmlns_attrs
        except AttributeError:
            declared = self._xmlns_attrs = []
        declared.append((prefix or 'xmlns', uri))
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName , attrs):
        """Create an element node (namespace mode) and queue START_ELEMENT.

        name -- ``(uri, localname)`` pair; tagName -- qualified name or
        None; attrs -- attribute collection keyed by ``(uri, localname)``.
        The first element triggers creation of the document.
        """
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        # Inject the namespace declarations collected since the previous
        # element as attributes in the xmlns namespace.
        declared = getattr(self, '_xmlns_attrs', None)
        if declared is not None:
            for decl_name, decl_value in declared:
                attrs._attrs[(xmlns_uri, decl_name)] = decl_value
            self._xmlns_attrs = []

        uri, localname = name
        if not uri:
            # An unprefixed tag name is just the local name.
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)
        else:
            # The reader may not supply the original qualified name; if it
            # does not, build *a* valid one from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                tagName = (prefix + ":" + localname) if prefix else localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)

        for (a_uri, a_localname), value in attrs.items():
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                qname = (prefix + ":" + a_localname) if prefix else a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self._enqueue((START_ELEMENT, node))
        self.push(node)

    def endElementNS(self, name, tagName):
        """Pop the current element and queue END_ELEMENT for it."""
        self._enqueue((END_ELEMENT, self.pop()))

    def startElement(self, name, attrs):
        """Create an element node (non-namespace mode), queue START_ELEMENT."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._enqueue((START_ELEMENT, node))
        self.push(node)

    def endElement(self, name):
        """Pop the current element and queue END_ELEMENT for it."""
        self._enqueue((END_ELEMENT, self.pop()))

    def comment(self, s):
        """Queue a COMMENT event, or park the text until a document exists."""
        if not self.document:
            self.pending_events.append([(COMMENT, s), None])
            return
        self._enqueue((COMMENT, self.document.createComment(s)))

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or park it until a
        document exists."""
        if not self.document:
            self.pending_events.append(
                [(PROCESSING_INSTRUCTION, target, data), None])
            return
        pi_node = self.document.createProcessingInstruction(target, data)
        self._enqueue((PROCESSING_INSTRUCTION, pi_node))

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a text node."""
        self._enqueue(
            (IGNORABLE_WHITESPACE, self.document.createTextNode(chars)))

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a text node."""
        self._enqueue((CHARACTERS, self.document.createTextNode(chars)))

    def startDocument(self):
        """Pick minidom's DOM implementation if no factory was supplied."""
        if self.documentFactory is not None:
            return
        from xml.dom.minidom import Document
        self.documentFactory = Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for root element *tagname* and return the
        document's first child.

        This cannot happen in startDocument() because the root tag name is
        needed.  START_DOCUMENT is queued first, then every parked comment
        and processing instruction is turned into a node and linked into
        the queue.
        """
        # XXX: obtain DocumentType
        doc = self.documentFactory.createDocument(uri, tagname, None)
        self.document = doc
        self._enqueue((START_DOCUMENT, doc))
        self.push(doc)
        # Put everything we have seen so far into the event queue.
        for cell in self.pending_events:
            kind = cell[0][0]
            if kind == PROCESSING_INSTRUCTION:
                _, target, data = cell[0]
                pi_node = self.document.createProcessingInstruction(
                    target, data)
                cell[0] = (PROCESSING_INSTRUCTION, pi_node)
            elif kind == COMMENT:
                cell[0] = (COMMENT, self.document.createComment(cell[0][1]))
            else:
                raise AssertionError("Unknown pending event ",cell[0][0])
            # The parked cell itself becomes the new tail.
            self.lastEvent[1] = cell
            self.lastEvent = cell
        self.pending_events = None
        return doc.firstChild

    def endDocument(self):
        """Link END_DOCUMENT after the tail and pop the document.

        The tail pointer is deliberately left where it was.
        """
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        """Drop the reference to the document."""
        self.document = None
195
class ErrorHandler:
    """Error callbacks: warnings are printed, everything else is raised."""

    def warning(self, exception):
        """Print *exception* and carry on."""
        print(exception)

    def error(self, exception):
        """Raise *exception*."""
        raise exception

    def fatalError(self, exception):
        """Raise *exception*."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000203
class DOMEventStream:
    """Iterator over the ``(event_type, node)`` pairs of an XML stream.

    The stream is read in chunks of *bufsize* and fed to *parser* on
    demand.  When the parser has no ``feed`` method the whole stream is
    parsed on the first request and the events are handed out from memory.
    """

    def __init__(self, stream, parser, bufsize):
        """Store the stream, parser and chunk size, then install a fresh
        PullDOM handler on the parser."""
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # No incremental interface: replace getEvent on this instance.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Create a new PullDOM and register it as the content handler."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event; *pos* is ignored.

        Raises IndexError when no more events are available.
        """
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def __next__(self):
        """Return the next event or raise StopIteration."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events and attach their nodes below *node*.

        Events are pulled until one carries *node* itself.  Every node
        delivered by anything other than END_ELEMENT is appended to the
        innermost open element, tracked in a local stack starting at
        *node*.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next event, feeding the parser as needed.

        Returns None once the stream yields no more data.
        """
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: move the tail back to the sentinel head.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                # NOTE(review): anything the handler queues during close()
                # is not returned by this call; only a later call would
                # see it.
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None when the queue is exhausted, matching
            getEvent(), so that iteration ends with StopIteration
            instead of failing on the empty queue.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
288
class SAX2DOM(PullDOM):
    """PullDOM variant that also links each new node into the tree.

    Elements, processing instructions and text nodes are appended to the
    node on top of the element stack as soon as they are created.
    """

    def buildDocument(self, uri, tagname):
        """Create the document and attach deferred processing instructions.

        Processing instructions seen before the root element are parked in
        ``pending_events`` by PullDOM; once the document exists they are
        inserted, in order, ahead of the node PullDOM.buildDocument()
        returns.
        """
        # Keep a reference: PullDOM.buildDocument() resets the attribute
        # after converting the parked entries into nodes in place.
        deferred = self.pending_events
        root = PullDOM.buildDocument(self, uri, tagname)
        for cell in deferred or ():
            kind, node = cell[0]
            if kind == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def startElementNS(self, name, tagName , attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # Before the root element there is no node yet and the element
            # stack is empty; the instruction has only been parked and is
            # attached by buildDocument().
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
Fred Drake16f63292000-10-23 18:09:50 +0000320
Fred Drakec16adce2000-12-14 18:00:18 +0000321
# Chunk size parse() uses when the caller does not supply one.
default_bufsize = 2 ** 14 - 20
323
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a file name (``str`` or a path-like object), which
        is opened in binary mode, or an object with a ``read`` method,
        which is used as is.
    parser -- SAX parser to use; a default one is created when omitted.
    bufsize -- chunk size for reading; defaults to ``default_bufsize``.

    A file opened here is not closed by this function; it stays
    reachable as the ``stream`` attribute of the returned object.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # Path-like objects are treated as file names only when they cannot be
    # read from, so every object accepted as a stream before still is.
    is_path = isinstance(stream_or_string, str) or (
        hasattr(stream_or_string, '__fspath__')
        and not hasattr(stream_or_string, 'read'))
    if is_path:
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000334
def parseString(string, parser=None):
    """Return a DOMEventStream over an in-memory XML document.

    string -- the document as ``str`` or as a bytes-like object.
    parser -- SAX parser to use; a default one is created when omitted.

    The whole document is handed to the parser in a single chunk.
    """
    from io import BytesIO, StringIO

    bufsize = len(string)
    # StringIO only accepts text; wrap binary input in BytesIO instead.
    if isinstance(string, str):
        buf = StringIO(string)
    else:
        buf = BytesIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)