blob: b2e1a46abc90b88819fa250ed5730ddcb73c0d9c [file] [log] [blame]
Fred Drakec16adce2000-12-14 18:00:18 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
# Types that parse() accepts as a file name (anything else is used as
# an already-open stream).  types.UnicodeType is looked up inside the
# try block so that, when it is missing, only plain strings qualify.
try:
    _StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
    _StringTypes = [types.StringType]

# Event-type tags.  PullDOM queues each event as a (tag, node) pair and
# DOMEventStream hands those pairs to the caller.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
Fred Drake55c38192000-06-29 19:39:57 +000018
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that queues (event-type, DOM node) pairs.

    Events are kept in a singly linked list of two-item cells
    ``[event, next_cell]``.  ``firstEvent`` is a sentinel cell whose
    second slot points at the oldest unread event; ``lastEvent`` is the
    cell most recently appended.  ``elementStack`` holds the document
    and the currently open elements, innermost last.

    The document object cannot be created until the root element's name
    is known, so comments and processing instructions seen before that
    point are parked in ``pending_events`` and converted to nodes by
    buildDocument().
    """
    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up empty event queue, element stack and namespace state.

        documentFactory -- object providing createDocument(); when None,
        startDocument() falls back to the minidom implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{XML_NAMESPACE:'xml'}] # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []
        # (name, uri) pairs announced by startPrefixMapping() that still
        # have to be turned into xmlns attributes on the next element.
        self._xmlns_attrs = []

    def pop(self):
        """Remove and return the innermost entry of elementStack."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _append_event(self, event):
        """Link *event* onto the end of the queue and make it lastEvent."""
        cell = [event, None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _qualified_name(self, uri, localname):
        """Build a qualified name for *localname* from the prefix that the
        current namespace context records for *uri*.

        Raises KeyError when *uri* has no entry in the current context.
        """
        prefix = self._current_context[uri]
        if prefix:
            return prefix + ":" + localname
        return localname

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration.

        The declaration is remembered twice: as a pending xmlns attribute
        for the next startElementNS() call, and as a uri -> prefix entry
        in the current context (the previous context is saved on
        _ns_contexts so endPrefixMapping() can restore it).
        """
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName , attrs):
        """Create an element node (and the document, for the root element),
        copy its attributes, queue START_ELEMENT and push the node.

        name    -- (uri, localname) pair
        tagName -- qualified name as written, or None if the parser did
                   not provide it
        attrs   -- attribute collection keyed by (uri, localname)
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        if self._xmlns_attrs:
            # The declarations are injected straight into the attribute
            # object's internal dict so the loop below picks them up.
            for aname, value in self._xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                tagName = self._qualified_name(uri, localname)
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                qname = self._qualified_name(a_uri, a_localname)
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElementNS(self, name, tagName):
        """Pop the innermost element and queue END_ELEMENT for it."""
        self._append_event((END_ELEMENT, self.pop()))

    def startElement(self, name, attrs):
        """Namespace-unaware counterpart of startElementNS()."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElement(self, name):
        """Pop the innermost element and queue END_ELEMENT for it."""
        self._append_event((END_ELEMENT, self.pop()))

    def comment(self, s):
        """Queue a COMMENT event, or park the text in pending_events when
        no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self._append_event((COMMENT, node))
        else:
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or park (target, data) in
        pending_events when no document exists yet."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self._append_event((PROCESSING_INSTRUCTION, node))
        else:
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event holding a new text node."""
        node = self.document.createTextNode(chars)
        self._append_event((IGNORABLE_WHITESPACE, node))

    def characters(self, chars):
        """Queue a CHARACTERS event holding a new text node."""
        node = self.document.createTextNode(chars)
        self._append_event((CHARACTERS, node))

    def startDocument(self):
        """Choose the default document factory if none was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for root element (uri, tagname), queue
        START_DOCUMENT followed by any parked events, and return the
        document's first child."""
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._append_event((START_DOCUMENT, node))
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _,target,data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            # The parked cell itself becomes the next link in the queue.
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the stack.

        lastEvent is deliberately left where it was, as in the original
        implementation.
        """
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
201
class ErrorHandler:
    """Minimal SAX error handler: report warnings, raise everything else."""

    def warning(self, exception):
        """Print the warning and carry on."""
        # Parenthesised form prints the same text under the Python 2 print
        # statement and is also valid where print is a function.
        print(exception)

    def error(self, exception):
        """Re-raise a recoverable parse error."""
        raise exception

    def fatalError(self, exception):
        """Re-raise a fatal parse error."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000209
class DOMEventStream:
    """Pull-style iterator over the (event-type, node) pairs produced by a
    PullDOM handler attached to *parser*.

    When the parser has a feed() method the stream is read in chunks of
    *bufsize* only as events are requested.  Otherwise the whole input
    is parsed on the first request and the queued events are replayed.
    """

    def __init__(self, stream, parser, bufsize):
        """stream  -- object with read(); handed to parser.parse() in the
                      fallback mode
        parser  -- SAX parser
        bufsize -- size passed to stream.read() for each chunk
        """
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Not an incremental parser: shadow getEvent on the instance.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Old-style iteration protocol; *pos* is ignored.

        Raises IndexError when no further event is available.
        """
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next event; raise StopIteration when there is none."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events up to the one that carries *node* again,
        appending every node met on the way to its parent so that *node*
        ends up with its complete subtree."""
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the oldest queued event, feeding the parser more input
        as needed; return None once the stream is exhausted."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: let new events link directly onto the
            # sentinel cell.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                # NOTE(review): anything the handler queues during
                # close() is not returned by this call.
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once the queue is drained, so that next() and
            __getitem__() end iteration as they do on the feed() path.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
294
class SAX2DOM(PullDOM):
    """PullDOM variant that also attaches each node it creates to the
    node currently on top of the element stack, so a DOM tree is built
    while the events are queued."""

    def startElementNS(self, name, tagName , attrs):
        """Create the element as PullDOM does, then hang it under the
        stack entry just below it."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        stack = self.elementStack
        element = stack[-1]
        stack[-2].appendChild(element)

    def startElement(self, name, attrs):
        """Namespace-unaware counterpart of startElementNS()."""
        PullDOM.startElement(self, name, attrs)
        stack = self.elementStack
        element = stack[-1]
        stack[-2].appendChild(element)

    def processingInstruction(self, target, data):
        """Queue the processing instruction and append its node to the
        innermost open node."""
        # NOTE(review): when no document exists yet the base class only
        # parks the event, so lastEvent does not refer to a new node here.
        PullDOM.processingInstruction(self, target, data)
        created = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(created)

    def ignorableWhitespace(self, chars):
        """Queue the whitespace text node and append it to the innermost
        open node."""
        PullDOM.ignorableWhitespace(self, chars)
        created = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(created)

    def characters(self, chars):
        """Queue the text node and append it to the innermost open node."""
        PullDOM.characters(self, chars)
        created = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(created)
Fred Drake16f63292000-10-23 18:09:50 +0000326
Fred Drakec16adce2000-12-14 18:00:18 +0000327
# Read size parse() hands to DOMEventStream when the caller gives no bufsize.
default_bufsize = (2 ** 14) - 20
329
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over an XML file or stream.

    stream_or_string -- a file name (any instance of the types listed in
                        _StringTypes, subclasses included) or an object
                        with a read() method
    parser           -- SAX parser to use; one is created with
                        xml.sax.make_parser() when omitted
    bufsize          -- chunk size for reading; default_bufsize when None
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() also accepts subclasses of the string types, which an
    # exact type() comparison would wrongly treat as streams.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: the parser must see the bytes exactly as stored.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000340
def parseString(string, parser=None):
    """Return a DOMEventStream over XML text held in memory.

    The text is wrapped in a StringIO object and the chunk size is set
    to the length of the text.  A parser is created with
    xml.sax.make_parser() when none is supplied.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    whole_length = len(string)
    source = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(source, parser, whole_length)