blob: c12c992a6856722e93aefc311d77c20eda63debe [file] [log] [blame]
Fred Drakec16adce2000-12-14 18:00:18 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
# Types that parse() treats as "a filename" rather than "an open stream".
try:
    _StringTypes = [types.StringType, types.UnicodeType]
except AttributeError:
    # Either UnicodeType is missing (interpreter without Unicode support)
    # or StringType itself is missing (Python 3, where the text type is
    # simply ``str``).  The original fallback referenced types.StringType
    # unconditionally and therefore re-raised AttributeError in the
    # latter case.
    _StringTypes = [getattr(types, "StringType", str)]

# Event-type tags.  Each queued event is a (tag, node) pair whose first
# item is one of these strings.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
Fred Drake55c38192000-06-29 19:39:57 +000018
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that queues (event-type, DOM node) pairs.

    Events live in a singly linked list of two-item lists
    ``[payload, next_cell]``.  ``firstEvent`` is a dummy head cell and
    ``lastEvent`` is the cell that was linked in most recently.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up the empty event queue, node stack and namespace state.

        documentFactory -- object whose createDocument() builds the
        document; when None, startDocument() picks a default.
        """
        self.documentFactory = documentFactory

        head = [None, None]
        self.firstEvent = head
        self.lastEvent = head

        stack = []
        self.elementStack = stack
        self.push = stack.append
        try:
            self.pop = stack.pop
        except AttributeError:
            # No bound pop available: the class-level pop() is used.
            pass

        root_context = {}
        self._ns_contexts = [root_context]  # stack of uri -> prefix dicts
        self._current_context = root_context
        self.pending_events = []

    def pop(self):
        """Remove the top of elementStack and return it."""
        stack = self.elementStack
        top = stack[-1]
        del stack[-1]
        return top

    def _append_event(self, payload):
        # Link a fresh cell carrying `payload` after the current tail and
        # make it the new tail.
        cell = [payload, None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def setDocumentLocator(self, locator):
        """Remember the locator handed over by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration and enter a new context.

        The declaration is also remembered in _xmlns_attrs so that the
        next startElementNS() can turn it into an xmlns attribute.
        """
        try:
            declared = self._xmlns_attrs
        except AttributeError:
            declared = self._xmlns_attrs = []
        declared.append((prefix or 'xmlns', uri))
        # Save a snapshot of the outer mapping, then extend the live one.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or ''

    def endPrefixMapping(self, prefix):
        """Return to the namespace context that was saved last."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element node (namespace mode) and queue START_ELEMENT.

        name is a (uri, localname) pair; tagName may be None, in which
        case a qualified name is derived from the current prefix mapping.
        """
        # Fold namespace declarations collected since the previous
        # element into the attribute set.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        declared = getattr(self, '_xmlns_attrs', None)
        if declared is not None:
            for decl_name, decl_value in declared:
                attrs._attrs[(xmlns_uri, decl_name)] = decl_value
            self._xmlns_attrs = []

        uri, localname = name
        if not uri:
            # Unprefixed tag: it simply appears as its local name.
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)
        else:
            # The reader may or may not supply the original name.  If it
            # does not, build *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                tagName = localname
                if prefix:
                    tagName = prefix + ":" + localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)

        for (a_uri, a_localname), value in attrs.items():
            if not a_uri:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            else:
                if a_uri == xmlns_uri:
                    qname = a_localname
                    if a_localname != 'xmlns':
                        qname = 'xmlns:' + a_localname
                else:
                    prefix = self._current_context[a_uri]
                    qname = a_localname
                    if prefix:
                        qname = prefix + ":" + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            attr.value = value

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the node popped off the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def startElement(self, name, attrs):
        """Create an element node (non-namespace mode) and queue it."""
        if not self.document:
            node = self.buildDocument(None, name)
        else:
            node = self.document.createElement(name)

        for attr_name, attr_value in attrs.items():
            attr = self.document.createAttribute(attr_name)
            attr.value = attr_value
            node.setAttributeNode(attr)

        self._append_event((START_ELEMENT, node))
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the node popped off the stack."""
        self._append_event((END_ELEMENT, self.pop()))

    def comment(self, s):
        """Queue a COMMENT event, or park the text until a document exists."""
        if not self.document:
            self.pending_events.append([(COMMENT, s), None])
            return
        self._append_event((COMMENT, self.document.createComment(s)))

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, or park it for later."""
        if not self.document:
            parked = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(parked)
            return
        pi_node = self.document.createProcessingInstruction(target, data)
        self._append_event((PROCESSING_INSTRUCTION, pi_node))

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event holding a text node."""
        text = self.document.createTextNode(chars)
        self._append_event((IGNORABLE_WHITESPACE, text))

    def characters(self, chars):
        """Queue a CHARACTERS event holding a text node."""
        text = self.document.createTextNode(chars)
        self._append_event((CHARACTERS, text))

    def startDocument(self):
        """Choose minidom's implementation when no factory was supplied."""
        if self.documentFactory is not None:
            return
        import xml.dom.minidom
        self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT, flush parked events.

        Returns the first child of the new document.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        doc = self.documentFactory.createDocument(uri, tagname, None)
        self.document = doc
        self._append_event((START_DOCUMENT, doc))
        self.push(doc)

        # Put everything we have seen so far into the document.  Parked
        # cells are updated in place and then linked into the queue.
        for cell in self.pending_events:
            kind = cell[0][0]
            if kind == PROCESSING_INSTRUCTION:
                _, target, data = cell[0]
                created = self.document.createProcessingInstruction(
                    target, data)
                cell[0] = (PROCESSING_INSTRUCTION, created)
            elif kind == COMMENT:
                created = self.document.createComment(cell[0][1])
                cell[0] = (COMMENT, created)
            else:
                raise AssertionError("Unknown pending event ", kind)
            self.lastEvent[1] = cell
            self.lastEvent = cell
        self.pending_events = None
        return doc.firstChild

    def endDocument(self):
        """Link END_DOCUMENT after the tail and pop the node stack."""
        # Note: the tail pointer itself is intentionally left where it is.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
200
class ErrorHandler:
    """Minimal SAX-style error handler: print warnings, raise errors."""

    def warning(self, exception):
        """Print the warning and carry on."""
        # Parenthesised single-argument form: same output under the
        # Python 2 print statement, and valid syntax on Python 3.
        print(exception)

    def error(self, exception):
        """Raise the reported exception."""
        raise exception

    def fatalError(self, exception):
        """Raise the reported exception."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000208
class DOMEventStream:
    """Pull events out of a PullDOM handler that is fed from a stream.

    The stream is read `bufsize` units at a time and pushed through the
    parser only when the event queue has run dry.
    """

    def __init__(self, stream, parser, bufsize):
        """Store the collaborators and install a fresh handler."""
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        self.reset()

    def reset(self):
        """Create a new PullDOM and register it with the parser."""
        handler = PullDOM()
        self.pulldom = handler
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(handler)

    def __getitem__(self, pos):
        """Return the next event; raise IndexError when there is none.

        `pos` is ignored: each call simply advances the stream.
        """
        event = self.getEvent()
        if not event:
            raise IndexError
        return event

    def expandNode(self, node):
        """Consume events, attaching nodes below `node`.

        Stops when an event carrying `node` itself is seen, or when the
        events run out.
        """
        ancestors = [node]
        while 1:
            event = self.getEvent()
            if not event:
                return
            token, cur_node = event
            if cur_node is node:
                return
            if token == END_ELEMENT:
                del ancestors[-1]
            else:
                ancestors[-1].appendChild(cur_node)
                if token == START_ELEMENT:
                    ancestors.append(cur_node)

    def getEvent(self):
        """Return the next queued event payload, or None at end of input."""
        handler = self.pulldom
        head = handler.firstEvent
        if not head[1]:
            # Queue drained: new events are chained from the head again.
            handler.lastEvent = head
        while not head[1]:
            chunk = self.stream.read(self.bufsize)
            if not chunk:
                self.parser.close()
                return None
            self.parser.feed(chunk)
        cell = head[1]
        head[1] = cell[1]
        return cell[0]

    def clear(self):
        "clear(): Explicitly release parsing objects"
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
262
class SAX2DOM(PullDOM):
    """PullDOM variant that also links each created node into the tree.

    After the inherited handler has queued an event, the node it made is
    appended to the node currently on top of elementStack.
    """

    def buildDocument(self, uri, tagname):
        """Build the document and attach processing instructions seen
        before the root element.

        Such instructions are parked in pending_events because no
        document existed yet.  PullDOM.buildDocument() turns them into
        nodes (updating the parked cells in place) and then resets
        pending_events, so the list is captured first.
        """
        parked = self.pending_events
        root = PullDOM.buildDocument(self, uri, tagname)
        for cell in parked:
            kind, node = cell[0]
            # Only processing instructions are linked into the tree by
            # this class, so parked comments are left as events only.
            if kind == PROCESSING_INSTRUCTION:
                self.document.insertBefore(node, root)
        return root

    def startElementNS(self, name, tagName, attrs):
        """Queue the element, then append it to its parent on the stack."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Queue the element, then append it to its parent on the stack."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Queue the instruction and append its node to the current parent."""
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # No document yet: the instruction was only parked, so there
            # is no node in lastEvent to attach.  Previously this path
            # failed on ``self.lastEvent[0][1]``.  buildDocument() links
            # it in once the document exists.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Queue the whitespace and append its text node to the parent."""
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Queue the text and append its text node to the parent."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
Fred Drake16f63292000-10-23 18:09:50 +0000294
Fred Drakec16adce2000-12-14 18:00:18 +0000295
# Read size used by parse() when the caller does not give one.
default_bufsize = (2 ** 14) - 20

def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a string is treated as a file name and opened;
        anything else is used as the stream directly.
    parser -- SAX parser to drive; one is created when omitted.
    bufsize -- read size handed to the event stream; default_bufsize
        when None.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() rather than an exact type() match, so that subclasses
    # of the string types are also recognised as file names.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        stream = open(stream_or_string)
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000308
def parseString(string, parser=None):
    """Return a DOMEventStream over XML text held in memory.

    string -- the document text; its length is used as the read size.
    parser -- SAX parser to drive; one is created when omitted.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module exists (Python 3): use io instead.
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)