blob: adc978e9a995ad96377e83487c2756e9fba986aa [file] [log] [blame]
Fred Drakec16adce2000-12-14 18:00:18 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
# Types that parse() treats as a file name rather than as an open stream.
# The legacy aliases in the ``types`` module are looked up defensively so
# that a missing alias cannot make the module fail at import time; on
# interpreters that provide both aliases the list is unchanged.
_StringTypes = [getattr(types, "StringType", str)]
if hasattr(types, "UnicodeType"):
    _StringTypes.append(types.UnicodeType)

# Event tokens: the first item of every (token, node) event tuple that
# PullDOM appends to its event list.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
Fred Drake55c38192000-06-29 19:39:57 +000018
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that turns callbacks into a list of DOM events.

    Events are kept in a singly linked list of two-item cells
    ``[event, next_cell]``.  ``firstEvent`` is a sentinel head cell and
    ``lastEvent`` is the cell most recently appended.  Each event is a
    ``(token, node)`` tuple whose token is one of the module-level event
    constants.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create the handler.

        documentFactory -- object whose createDocument(uri, tagname, None)
            builds the document; when None, startDocument() installs the
            xml.dom.minidom implementation.
        """
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        # The "xml" prefix is predeclared and is never announced through
        # startPrefixMapping(), so it has to be seeded here.  Without it,
        # names such as xml:lang make the prefix lookups in
        # startElementNS() fail with KeyError.
        xml_namespace = 'http://www.w3.org/XML/1998/namespace'
        self._ns_contexts = [{xml_namespace: 'xml'}]  # uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []

    def pop(self):
        """Remove and return the top of elementStack.

        Only used when the list object offers no pop method of its own
        (see __init__).
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the element about to start."""
        if not hasattr(self, '_xmlns_attrs'):
            self._xmlns_attrs = []
        # Kept so that startElementNS() can re-create the declaration as
        # an attribute; an empty prefix stands for the default namespace.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save the mappings as they were before this declaration;
        # endPrefixMapping() restores them.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the mappings saved by the matching startPrefixMapping."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName , attrs):
        """Create an element node and append a START_ELEMENT event.

        name    -- (uri, localname) pair
        tagName -- qualified name, or None if the reader did not supply it
        attrs   -- attributes keyed by (uri, localname) pairs

        The first element of a document also creates the document itself
        through buildDocument().
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            # NOTE(review): this writes to the private _attrs mapping of
            # the attributes object; it presumably expects the
            # implementation used by the xml.sax readers -- confirm for
            # other attribute classes.
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                # Namespace declaration: "xmlns" or "xmlns:<prefix>".
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Append an END_ELEMENT event for the element on top of the stack."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Create an element node (no namespaces) and append START_ELEMENT."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Append an END_ELEMENT event for the element on top of the stack."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Append a COMMENT event, or queue it until the document exists."""
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document yet, so no node can be created; buildDocument()
            # converts the queued text later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Append a PROCESSING_INSTRUCTION event, or queue it until the
        document exists."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Append an IGNORABLE_WHITESPACE event holding a text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Append a CHARACTERS event holding a text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Select the default document factory if none was supplied."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for the root element and return that element.

        Appends a START_DOCUMENT event, pushes the document onto the
        element stack and then moves the comments and processing
        instructions queued before the root element into the event list.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ", e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Append the END_DOCUMENT event and pop the document off the stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
200
class ErrorHandler:
    """SAX error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        """Print the warning and let parsing continue."""
        print(exception)

    def error(self, exception):
        """Raise the exception reported for a recoverable error."""
        raise exception

    def fatalError(self, exception):
        """Raise the exception reported for a fatal error."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000208
class DOMEventStream:
    """Sequence-like stream of (token, node) events read from a SAX parser.

    Indexing the object (and therefore looping over it with ``for``)
    returns the next event regardless of the index supplied, and raises
    IndexError when the events are exhausted.
    """

    def __init__(self, stream, parser, bufsize):
        """Attach a fresh PullDOM handler to *parser*.

        stream  -- object with a read(size) method supplying the document
        parser  -- SAX parser; if it has no feed() method the whole
                   document is parsed on the first request (see _slurp)
        bufsize -- amount of data requested per read
        """
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Install a new PullDOM content handler on the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Return the next event; *pos* is ignored."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def expandNode(self, node):
        """Consume events up to the end of *node*, building its subtree.

        Every node seen before the event that carries *node* itself is
        appended to its parent, so that *node* ends up with all of its
        descendants attached.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next event, or None once the stream is exhausted."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Every queued event has been handed out; new events must be
            # linked directly after the sentinel head cell again.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once every event has been handed out, just as
            getEvent() does, so that indexing ends with IndexError
            instead of failing on the missing cell.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
284
class SAX2DOM(PullDOM):
    """PullDOM variant that also links every new node into the tree.

    Each overridden callback first lets PullDOM create the node and
    record the event, then appends that node to its parent taken from
    elementStack.
    """

    def startElementNS(self, name, tagName , attrs):
        """Create the element, then append it to the node below it on
        the element stack."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        element = self.elementStack[-1]
        self.elementStack[-2].appendChild(element)

    def startElement(self, name, attrs):
        """Create the element, then append it to the node below it on
        the element stack."""
        PullDOM.startElement(self, name, attrs)
        element = self.elementStack[-1]
        self.elementStack[-2].appendChild(element)

    def processingInstruction(self, target, data):
        """Create the processing instruction and append it to the node on
        top of the element stack."""
        PullDOM.processingInstruction(self, target, data)
        # NOTE(review): while no document exists PullDOM only queues the
        # instruction and leaves lastEvent untouched, so this lookup
        # looks like it cannot succeed for an instruction that precedes
        # the root element -- confirm.
        created = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(created)

    def ignorableWhitespace(self, chars):
        """Create the whitespace text node and append it to the node on
        top of the element stack."""
        PullDOM.ignorableWhitespace(self, chars)
        created = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(created)

    def characters(self, chars):
        """Create the text node and append it to the node on top of the
        element stack."""
        PullDOM.characters(self, chars)
        created = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(created)
Fred Drake16f63292000-10-23 18:09:50 +0000316
Fred Drakec16adce2000-12-14 18:00:18 +0000317
# Read size used by parse() when the caller does not supply one.
default_bufsize = (2 ** 14) - 20


def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a file name or an open stream.

    stream_or_string -- a file name (any instance of the types listed in
        _StringTypes, subclasses included), or an object that is handed
        to DOMEventStream as the stream
    parser  -- SAX parser to use; xml.sax.make_parser() when omitted
    bufsize -- read size; default_bufsize when None
    """
    if bufsize is None:
        bufsize = default_bufsize
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: the parser must see the document exactly as
        # stored, without platform newline translation.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000330
def parseString(string, parser=None):
    """Return a DOMEventStream that reads the document held in *string*.

    The read size is the length of the string, so one read returns the
    whole document.  When *parser* is omitted a parser is obtained from
    xml.sax.make_parser().
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    document_length = len(string)
    source = StringIO(string)
    sax_parser = parser or xml.sax.make_parser()
    return DOMEventStream(source, sax_parser, document_length)