blob: 255689c48871b158572ec799f044817a6b279d2e [file] [log] [blame]
Fred Drakec16adce2000-12-14 18:00:18 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
5try:
6 _StringTypes = [types.StringType, types.UnicodeType]
7except AttributeError:
8 _StringTypes = [types.StringType]
Fred Drake55c38192000-06-29 19:39:57 +00009
# Event type tokens.  Every queued event is a tuple whose first item is
# one of these strings.

# Document boundaries
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"

# Element boundaries
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Character data
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"

# Other markup
COMMENT = "COMMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
Fred Drake55c38192000-06-29 19:39:57 +000018
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that records parser callbacks as DOM events.

    Events are stored in a singly linked list of two-item cells
    ``[event, next_cell]``.  ``firstEvent`` is the head sentinel and
    ``lastEvent`` is the cell after which the next event is linked.  Each
    queued event is a tuple ``(event_type, node)``.

    The document object is created only when the first element starts
    (see buildDocument()), because its creation needs the root tag name.
    Comments and processing instructions reported before that point wait
    in ``pending_events``.
    """

    _locator = None
    document = None
    # xmlns declarations collected by startPrefixMapping() for the next
    # start tag; None until the first prefix mapping is reported.
    _xmlns_attrs = None

    def __init__(self, documentFactory=None):
        """Set up empty event, element and namespace bookkeeping.

        documentFactory -- object providing createDocument(uri, tagname,
            doctype); when None, startDocument() selects the minidom
            implementation.
        """
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        # Bind the list operations directly; a list always provides both.
        self.push = self.elementStack.append
        self.pop = self.elementStack.pop
        # Stack of uri -> prefix dicts.  The reserved "xml" prefix is bound
        # from the start because it is never announced through
        # startPrefixMapping(), yet attributes such as xml:lang use it.
        self._ns_contexts = [{'http://www.w3.org/XML/1998/namespace': 'xml'}]
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack.

        Class-level counterpart of the bound list method that __init__
        installs on every instance.
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _append_event(self, event):
        """Link *event* at the tail of the event list."""
        cell = [event, None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _qualified_name(self, uri, localname):
        """Return a qualified name for *localname* using the prefix that
        the current namespace context associates with *uri*."""
        prefix = self._current_context[uri]
        if prefix:
            return prefix + ":" + localname
        return localname

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the upcoming element."""
        if self._xmlns_attrs is None:
            self._xmlns_attrs = []
        # An empty prefix denotes the default namespace declaration.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save the context as it was, then extend the live one.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element node with namespace support and queue a
        START_ELEMENT event for it.

        name    -- (uri, localname) pair
        tagName -- qualified name, or None if the parser did not supply it
        attrs   -- attribute collection keyed by (uri, localname) pairs
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        if self._xmlns_attrs is not None:
            for aname, value in self._xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []

        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                tagName = self._qualified_name(uri, localname)
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                qname = self._qualified_name(a_uri, a_localname)
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def startElement(self, name, attrs):
        """Create an element node (no namespace processing) and queue a
        START_ELEMENT event for it."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def comment(self, s):
        """Queue a COMMENT event, or hold the text back until the
        document exists."""
        if self.document:
            node = self.document.createComment(s)
            self._append_event((COMMENT, node))
        else:
            self.pending_events.append([(COMMENT, s), None])

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or hold target and data
        back until the document exists."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self._append_event((PROCESSING_INSTRUCTION, node))
        else:
            self.pending_events.append(
                [(PROCESSING_INSTRUCTION, target, data), None])

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event holding a text node."""
        node = self.document.createTextNode(chars)
        self._append_event((IGNORABLE_WHITESPACE, node))

    def characters(self, chars):
        """Queue a CHARACTERS event holding a text node."""
        node = self.document.createTextNode(chars)
        self._append_event((CHARACTERS, node))

    def startDocument(self):
        """Select the default document factory if none was given."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT followed by all
        pending events, and return the document's first child."""
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._append_event((START_DOCUMENT, node))
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            kind = e[0][0]
            if kind == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif kind == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ",e[0][0])
            # The pending cell itself becomes part of the event list.
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the document off the stack."""
        # lastEvent is deliberately left where it is, as in the original.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
200
class ErrorHandler:
    """Error handler that prints warnings and raises everything else."""

    def warning(self, exception):
        """Print the warning to standard output and carry on."""
        # Call form instead of the print statement: same output for a
        # single argument, and valid syntax on every Python version.
        print(exception)

    def error(self, exception):
        """Raise the reported error."""
        raise exception

    def fatalError(self, exception):
        """Raise the reported fatal error."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000208
class DOMEventStream:
    """Pull-style source of (event_type, node) tuples for one XML input.

    The input is read from *stream* in chunks of *bufsize* and fed to
    *parser*; a PullDOM content handler turns the callbacks into events
    that getEvent() hands out one at a time.  Parsers without a feed()
    method are run to completion on the first request instead.
    """

    # Set once the incremental parser has been closed at end of input, so
    # that later calls neither read the stream nor close the parser again.
    _closed = 0

    def __init__(self, stream, parser, bufsize):
        """Remember the input objects and install the content handler.

        stream  -- object with a read(size) method
        parser  -- SAX parser; used incrementally if it has feed()
        bufsize -- number of characters/bytes requested per read()
        """
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Not an incremental parser: fall back to parsing everything
            # at once on the first request.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Attach a fresh PullDOM handler to the parser."""
        self.pulldom = PullDOM()
        self._closed = 0
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Sequence-style iteration: *pos* is ignored; the next event is
        returned, or IndexError is raised when there is none."""
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next event or raise StopIteration."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    # Same method under the name newer iterator protocols look for.
    __next__ = next

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events up to the one that carries *node* again,
        appending every node seen on the way into the subtree of *node*."""
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token == END_DOCUMENT:
                # End of input without meeting *node* again; the document
                # itself must not be appended anywhere.
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next event, or None when the input is exhausted."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        pulldom = self.pulldom
        if not pulldom.firstEvent[1]:
            # Queue drained: make the handler link new events to the head.
            pulldom.lastEvent = pulldom.firstEvent
        while not pulldom.firstEvent[1]:
            if self._closed:
                return None
            buf = self.stream.read(self.bufsize)
            if buf:
                self.parser.feed(buf)
            else:
                # Closing may deliver final callbacks to the handler (the
                # handler queues END_DOCUMENT from endDocument()), so the
                # queue is checked once more before giving up.
                self.parser.close()
                self._closed = 1
        return self._emit()

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Hand out the oldest queued event, or None when the queue
            is empty.
        """
        cell = self.pulldom.firstEvent[1]
        if cell is None:
            return None
        self.pulldom.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
293
class SAX2DOM(PullDOM):
    """PullDOM variant that also links the nodes it creates into the
    document tree: elements, processing instructions and text nodes are
    appended to the node currently on top of the element stack."""

    def _append_top_element(self):
        """Append the newest element on the stack to the one below it."""
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def _append_last_event_node(self):
        """Append the node of the most recent event to the stack top."""
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def buildDocument(self, uri, tagname):
        """Build the document as the base class does, then link the
        processing instructions that were reported before the root
        element in front of it."""
        # Keep a reference: the base class turns these cells into
        # (event_type, node) events in place and then drops the list.
        queued = self.pending_events
        root = PullDOM.buildDocument(self, uri, tagname)
        for cell in queued or ():
            if cell[0][0] == PROCESSING_INSTRUCTION:
                self.document.insertBefore(cell[0][1], root)
        return root

    def startElementNS(self, name, tagName , attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        self._append_top_element()

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        self._append_top_element()

    def processingInstruction(self, target, data):
        # Without a document the base class only queues target and data:
        # there is no node and no stack entry yet.  buildDocument() links
        # the node once it exists.
        deferred = not self.document
        PullDOM.processingInstruction(self, target, data)
        if not deferred:
            self._append_last_event_node()

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        self._append_last_event_node()

    def characters(self, chars):
        PullDOM.characters(self, chars)
        self._append_last_event_node()
Fred Drake16f63292000-10-23 18:09:50 +0000325
Fred Drakec16adce2000-12-14 18:00:18 +0000326
# Read size that parse() uses when the caller does not pass bufsize:
# 16 KiB less 20.
default_bufsize = 2 ** 14 - 20
328
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a file name or an open stream.

    stream_or_string -- a string is taken as a file name and opened;
        anything else is used as the input stream as it is
    parser  -- SAX parser to use; xml.sax.make_parser() when omitted
    bufsize -- read size for the stream; default_bufsize when None

    A file opened here is handed to the returned DOMEventStream and is
    not closed by this function.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() also accepts subclasses of the string types, which the
    # former exact type comparison treated as streams.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: the parser should see the document bytes unchanged.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000339
def parseString(string, parser=None):
    """Return a DOMEventStream that reads the XML text in *string*.

    parser -- SAX parser to use; xml.sax.make_parser() when omitted
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module is available; fall back to the io
            # module instead of failing the call with ImportError.
            from io import StringIO

    # A buffer as large as the text lets one read() return all of it.
    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)