blob: 5fc215ab73449672ebbbacd7cc1b4b106f0848a4 [file] [log] [blame]
Fred Drakec16adce2000-12-14 18:00:18 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
# Types that parse() treats as "a filename" rather than "an open stream".
# Built defensively: the original try/except re-raised AttributeError from
# its own handler whenever ``types.StringType`` itself was missing, so the
# module could not even be imported on such interpreters.
_StringTypes = [getattr(types, "StringType", str)]
if hasattr(types, "UnicodeType"):
    _StringTypes.append(types.UnicodeType)
Fred Drake55c38192000-06-29 19:39:57 +00009
# Event tokens: the first item of every (token, node) pair queued by the
# content handler.  Each token's value is simply its own name.

# Document boundaries.
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"

# Element boundaries.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Leaf content.
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
COMMENT = "COMMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
Fred Drake55c38192000-06-29 19:39:57 +000018
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that turns parser callbacks into DOM events.

    Every callback creates a DOM node through ``self.document``, attaches
    it to ``self.curNode`` and queues a ``(token, node)`` pair.

    The queue is a singly linked list of two-item lists ``[event, next]``:
    ``firstEvent`` is a dummy head cell whose ``[1]`` slot points at the
    oldest pending event, and ``lastEvent`` is the cell that the next event
    will be linked onto.
    """

    _locator = None
    document = None

    # The "xml" prefix is bound to this URI by definition and is never
    # announced through startPrefixMapping(), so it has to be pre-seeded
    # or attributes such as xml:lang fail the prefix lookup with KeyError.
    _XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"

    def __init__(self, documentFactory=None):
        """Create a handler.

        documentFactory -- object whose ``createDocument`` method builds the
        document node; when None, startDocument() falls back to minidom.
        """
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        # Stack of uri -> prefix dicts.
        self._ns_contexts = [{self._XML_NAMESPACE: 'xml'}]
        self._current_context = self._ns_contexts[-1]

    # -- internal helpers --------------------------------------------------

    def _push_event(self, token, node):
        """Link a new (token, node) cell onto the queue and make it the tail."""
        cell = [(token, node), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _qualified_name(self, uri, localname):
        """Return a qualified name for *localname* in namespace *uri*.

        The prefix comes from the current namespace context.  An empty
        prefix (default namespace) yields the bare local name instead of
        the malformed ":localname".  Raises KeyError when *uri* has no
        mapping in the current context.
        """
        prefix = self._current_context[uri]
        if prefix:
            return prefix + ":" + localname
        return localname

    # -- SAX callbacks -----------------------------------------------------

    def setDocumentLocator(self, locator):
        """Remember the parser's locator for use in startDocument()."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record that *uri* is now reachable through *prefix*.

        A snapshot of the context as it was *before* this mapping is pushed
        so that endPrefixMapping() can restore it.  A missing prefix
        (default namespace) is stored as ''.
        """
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or ''

    def endPrefixMapping(self, prefix):
        """Restore the context saved by the matching startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element for a namespace-aware start tag and queue it.

        name    -- (uri, localname) pair
        tagName -- qualified name as written in the document, or None when
                   the reader does not supply it
        attrs   -- mapping of (uri, localname) pairs to attribute values
        """
        uri, localname = name
        if uri:
            # The reader may or may not provide the original name.  If
            # not, create *a* valid tagName from the current context.
            if tagName is None:
                tagName = self._qualified_name(uri, localname)
            node = self.document.createElementNS(uri, tagName)
        else:
            # An element outside any namespace is known by its local name.
            node = self.document.createElement(localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri:
                qname = self._qualified_name(a_uri, a_localname)
                attr = self.document.createAttributeNS(a_uri, qname)
            else:
                attr = self.document.createAttribute(a_localname)
            attr.value = value
            node.setAttributeNode(attr)

        self.curNode.appendChild(node)
        self.curNode = node
        self._push_event(START_ELEMENT, node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the current node and step up to its parent."""
        node = self.curNode
        self._push_event(END_ELEMENT, node)
        self.curNode = node.parentNode

    def startElement(self, name, attrs):
        """Create an element for a non-namespace start tag and queue it."""
        node = self.document.createElement(name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.curNode.appendChild(node)
        self.curNode = node
        self._push_event(START_ELEMENT, node)

    def endElement(self, name):
        """Queue END_ELEMENT for the current node and step up to its parent."""
        node = self.curNode
        self._push_event(END_ELEMENT, node)
        self.curNode = node.parentNode

    def comment(self, s):
        """Create a comment node under the current node and queue it."""
        node = self.document.createComment(s)
        self.curNode.appendChild(node)
        self._push_event(COMMENT, node)

    def processingInstruction(self, target, data):
        """Create a processing-instruction node and queue it."""
        node = self.document.createProcessingInstruction(target, data)
        self.curNode.appendChild(node)
        self._push_event(PROCESSING_INSTRUCTION, node)

    def ignorableWhitespace(self, chars):
        """Create a text node for ignorable whitespace and queue it."""
        node = self.document.createTextNode(chars)
        self.curNode.appendChild(node)
        self._push_event(IGNORABLE_WHITESPACE, node)

    def characters(self, chars):
        """Create a text node for character data and queue it."""
        node = self.document.createTextNode(chars)
        self.curNode.appendChild(node)
        self._push_event(CHARACTERS, node)

    def startDocument(self):
        """Create the document node, make it current and queue START_DOCUMENT.

        The public and system identifiers are read from the locator when
        one was supplied; otherwise both are None.
        """
        publicId = systemId = None
        if self._locator:
            publicId = self._locator.getPublicId()
            systemId = self._locator.getSystemId()
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation
        # NOTE(review): the argument order (None, publicId, systemId) is
        # kept exactly as it was -- confirm it against the createDocument
        # signature of the document factory actually in use.
        node = self.documentFactory.createDocument(None, publicId, systemId)
        self.curNode = self.document = node
        self._push_event(START_DOCUMENT, node)

    def endDocument(self):
        """Queue END_DOCUMENT carrying the document element.

        Raises AssertionError when an element is still open or the document
        has no root element (the checks vanish under ``python -O``).
        """
        assert self.curNode.parentNode is None, \
               "not all elements have been properly closed"
        assert self.curNode.documentElement is not None, \
               "document does not contain a root element"
        node = self.curNode.documentElement
        # Deliberately not advancing lastEvent here, matching the previous
        # behaviour: the cell is linked but the tail pointer stays put.
        self.lastEvent[1] = [(END_DOCUMENT, node), None]
Fred Drake55c38192000-06-29 19:39:57 +0000161
class ErrorHandler:
    """Minimal error handler: report warnings, re-raise everything else."""

    def warning(self, exception):
        """Print the warning to standard output and carry on."""
        # Parenthesised form: same output as the old print statement on
        # Python 2, and not a syntax error on newer interpreters.
        print(exception)

    def error(self, exception):
        """Raise the recoverable error that was passed in."""
        raise exception

    def fatalError(self, exception):
        """Raise the fatal error that was passed in."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000169
class DOMEventStream:
    """Pull-style view over a stream: feeds the parser only on demand.

    stream  -- object with a ``read(size)`` method
    parser  -- parser offering ``setFeature``, ``setContentHandler`` and
               ``feed``
    bufsize -- amount requested from the stream per read
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        handler = PullDOM()
        self.pulldom = handler
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(handler)

    def __getitem__(self, pos):
        """Return the next event; *pos* is ignored.

        Raises IndexError once no further event is available.
        """
        event = self.getEvent()
        if not event:
            raise IndexError
        return event

    def expandNode(self, node):
        """Consume events until *node* shows up again, re-attaching nodes.

        Every node seen on the way, except on END_ELEMENT events, is
        appended to its own parent.  Returns when an event carrying *node*
        itself is reached or the event supply runs out.
        """
        while 1:
            event = self.getEvent()
            if not event:
                break
            token, seen = event
            if seen is node:
                return
            if token != END_ELEMENT:
                seen.parentNode.appendChild(seen)

    def getEvent(self):
        """Return the oldest pending (token, node) pair, or None at EOF.

        When the queue is empty the stream is read in ``bufsize`` pieces
        and fed to the parser until the handler has queued something.
        """
        handler = self.pulldom
        head = handler.firstEvent
        if not head[1]:
            # Queue drained: point the tail back at the dummy head cell.
            handler.lastEvent = head
        while not head[1]:
            chunk = self.stream.read(self.bufsize)
            if not chunk:
                #FIXME: why doesn't Expat close work?
                #self.parser.close()
                return None
            self.parser.feed(chunk)
        cell = head[1]
        head[1] = cell[1]
        return cell[0]
212
class SAX2DOM(PullDOM):
    """PullDOM variant that appends each new node to its parent again.

    NOTE(review): the PullDOM handlers already call
    ``self.curNode.appendChild(node)``; whether the extra appendChild here
    is a no-op or inserts a duplicate depends on the DOM implementation --
    confirm before relying on either behaviour.
    """

    def startElementNS(self, name, tagName , attrs):
        """Run the base handler, then append the new element to its parent."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        element = self.curNode
        element.parentNode.appendChild(element)

    def startElement(self, name, attrs):
        """Run the base handler, then append the new element to its parent."""
        PullDOM.startElement(self, name, attrs)
        element = self.curNode
        element.parentNode.appendChild(element)

    def processingInstruction(self, target, data):
        """Run the base handler, then append the queued node to its parent."""
        PullDOM.processingInstruction(self, target, data)
        # lastEvent is the cell just queued; its first slot is (token, node).
        queued = self.lastEvent[0]
        node = queued[1]
        node.parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Run the base handler, then append the queued node to its parent."""
        PullDOM.ignorableWhitespace(self, chars)
        queued = self.lastEvent[0]
        node = queued[1]
        node.parentNode.appendChild(node)

    def characters(self, chars):
        """Run the base handler, then append the queued node to its parent."""
        PullDOM.characters(self, chars)
        queued = self.lastEvent[0]
        node = queued[1]
        node.parentNode.appendChild(node)
237
Fred Drakec16adce2000-12-14 18:00:18 +0000238
# Read size parse() falls back on when no bufsize is given: 16364.
default_bufsize = 2 ** 14 - 20
240
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a file name (any of the types in _StringTypes, or
                        a subclass of one) or an object with ``read(size)``
    parser           -- parser to use; a default one is created through
                        xml.sax.make_parser() when omitted
    bufsize          -- amount read from the stream per request; defaults
                        to default_bufsize

    A file opened here is not closed by this function; it stays reachable
    as the ``stream`` attribute of the returned object.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() also accepts subclasses of the string types, which the
    # previous exact-type comparison rejected and then tried to read().
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: hand the parser the document's raw bytes so that no
        # newline translation happens before it sees them.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000251
def parseString(string, parser=None):
    """Return a DOMEventStream over an in-memory document.

    string -- the complete document text
    parser -- parser to use; a default one is created through
              xml.sax.make_parser() when omitted

    The buffer size is the length of *string*, so a single read hands the
    whole document to the parser.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module is available: fall back to io and pick
            # the buffer class that matches the kind of data supplied.
            import io
            if isinstance(string, bytes):
                StringIO = io.BytesIO
            else:
                StringIO = io.StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)
Fred Drake1f549022000-09-24 05:21:58 +0000262 return DOMEventStream(buf, parser, bufsize)