blob: b573ba0d18df96700e3cd0d520758be81d90ad55 [file] [log] [blame]
Fred Drakec16adce2000-12-14 18:00:18 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003import types
4
5try:
6 _StringTypes = [types.StringType, types.UnicodeType]
7except AttributeError:
8 _StringTypes = [types.StringType]
Fred Drake55c38192000-06-29 19:39:57 +00009
# Event-type tags.  Every event queued by PullDOM is an (event_type, node)
# pair whose first item is one of these strings.

# Document boundaries
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"

# Element boundaries
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Text content
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"

# Miscellaneous markup
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
COMMENT = "COMMENT"
Fred Drake55c38192000-06-29 19:39:57 +000018
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that converts parser callbacks into DOM events.

    Each callback creates a DOM node through ``self.document`` and appends an
    ``(event_type, node)`` pair to a singly linked list whose cells have the
    shape ``[event, next_cell]``.  ``firstEvent`` is a dummy head cell and
    ``lastEvent`` is the cell most recently appended.

    The document object cannot be created before the root element's name is
    known, so it is built lazily by buildDocument() when the first element
    starts.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Set up an empty event queue, element stack and namespace context.

        documentFactory -- object whose createDocument(uri, tagname, doctype)
            builds the document; when None, startDocument() installs
            minidom's DOM implementation.
        """
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        # Stack of uri -> prefix dicts.  The "xml" prefix is bound by
        # definition and is never announced through startPrefixMapping(),
        # so it has to be present from the start; otherwise attributes such
        # as xml:lang fail the prefix lookup in startElementNS().
        self._ns_contexts = [{"http://www.w3.org/XML/1998/namespace": "xml"}]
        self._current_context = self._ns_contexts[-1]
        # (event_type, constructor_args) for comments and processing
        # instructions reported before the document object exists.
        self._pending_events = []

    def pop(self):
        """Remove and return the top of the element stack (fallback for
        list objects without a pop method)."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def _append_event(self, token, node):
        """Link a new (token, node) cell onto the tail of the event queue."""
        cell = [(token, node), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _qualified_name(self, uri, localname):
        """Build a qualified name from the prefix currently mapped to uri.

        Raises KeyError when uri has no mapping in the current context.
        """
        prefix = self._current_context[uri]
        if prefix:
            return prefix + ":" + localname
        return localname

    def setDocumentLocator(self, locator):
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        # Save a snapshot of the context without the new mapping; it is
        # restored by the matching endPrefixMapping().
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or ''

    def endPrefixMapping(self, prefix):
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Queue a START_ELEMENT event for a namespace-aware element.

        name -- (uri, localname) pair.
        tagName -- qualified name as written, or None if the reader did not
            supply it.
        attrs -- mapping from (uri, localname) pairs to attribute values.
        """
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                tagName = self._qualified_name(uri, localname)
            if self.document is None:
                node = self.buildDocument(uri, tagName)
            else:
                node = self.document.createElementNS(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document is None:
                node = self.buildDocument(None, localname)
            else:
                node = self.document.createElement(localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri:
                qname = self._qualified_name(a_uri, a_localname)
                attr = self.document.createAttributeNS(a_uri, qname)
            else:
                attr = self.document.createAttribute(a_localname)
            attr.value = value
            node.setAttributeNode(attr)

        self._append_event(START_ELEMENT, node)
        self.push(node)

    def endElementNS(self, name, tagName):
        self._append_event(END_ELEMENT, self.pop())

    def startElement(self, name, attrs):
        """Queue a START_ELEMENT event for a non-namespace element."""
        if self.document is None:
            node = self.buildDocument(None, name)
        else:
            node = self.document.createElement(name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self._append_event(START_ELEMENT, node)
        self.push(node)

    def endElement(self, name):
        self._append_event(END_ELEMENT, self.pop())

    def comment(self, s):
        if self.document is None:
            # Comment in the prolog: no document to create the node with
            # yet, so remember it until buildDocument() runs.
            self._pending_events.append((COMMENT, (s,)))
        else:
            self._append_event(COMMENT, self.document.createComment(s))

    def processingInstruction(self, target, data):
        if self.document is None:
            # Same deferral as for prolog comments.
            self._pending_events.append(
                (PROCESSING_INSTRUCTION, (target, data)))
        else:
            node = self.document.createProcessingInstruction(target, data)
            self._append_event(PROCESSING_INSTRUCTION, node)

    def ignorableWhitespace(self, chars):
        node = self.document.createTextNode(chars)
        self._append_event(IGNORABLE_WHITESPACE, node)

    def characters(self, chars):
        node = self.document.createTextNode(chars)
        self._append_event(CHARACTERS, node)

    def startDocument(self):
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT, and return the root.

        The deferred prolog comments and processing instructions are turned
        into nodes and queued directly after START_DOCUMENT, in the order in
        which they were reported.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self._append_event(START_DOCUMENT, node)
        self.push(node)
        for token, args in self._pending_events:
            if token == PROCESSING_INSTRUCTION:
                pending = node.createProcessingInstruction(*args)
            else:
                pending = node.createComment(*args)
            self._append_event(token, pending)
        self._pending_events = []
        return node.firstChild

    def endDocument(self):
        # lastEvent is deliberately left where it is: the END_DOCUMENT cell
        # is linked in but not made the new tail.
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        "clear(): Explicitly release parsing structures"
        self.document = None
160
class ErrorHandler:
    """Error handler that prints warnings and re-raises everything else."""

    def warning(self, exception):
        """Report a warning on standard output and continue."""
        print(exception)

    def error(self, exception):
        """Raise the recoverable error handed over by the parser."""
        raise exception

    def fatalError(self, exception):
        """Raise the fatal error handed over by the parser."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000168
class DOMEventStream:
    """Pull-style access to the DOM events produced while parsing a stream.

    The stream is read in chunks of ``bufsize`` and fed to the parser only
    when the event queue of the attached PullDOM handler has run dry.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream, self.parser, self.bufsize = stream, parser, bufsize
        self.reset()

    def reset(self):
        """Attach a fresh PullDOM handler to the parser."""
        handler = PullDOM()
        self.pulldom = handler
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(handler)

    def __getitem__(self, pos):
        """Return the next event; ``pos`` is ignored.

        Raises IndexError once getEvent() returns a false value.
        """
        event = self.getEvent()
        if not event:
            raise IndexError
        return event

    def expandNode(self, node):
        """Consume events up to the one carrying ``node`` again, appending
        every node seen on the way to its parent below ``node``."""
        ancestors = [node]
        while True:
            event = self.getEvent()
            if not event:
                return
            token, cur_node = event
            if cur_node is node:
                return
            if token == END_ELEMENT:
                del ancestors[-1]
            else:
                ancestors[-1].appendChild(cur_node)
                if token == START_ELEMENT:
                    ancestors.append(cur_node)

    def getEvent(self):
        """Return the next (event_type, node) pair, or None at end of input."""
        handler = self.pulldom
        if not handler.firstEvent[1]:
            # Queue fully drained: new events must be linked after the head.
            handler.lastEvent = handler.firstEvent
        while not handler.firstEvent[1]:
            chunk = self.stream.read(self.bufsize)
            if not chunk:
                self.parser.close()
                return None
            self.parser.feed(chunk)
        cell = handler.firstEvent[1]
        handler.firstEvent[1] = cell[1]
        return cell[0]

    def clear(self):
        "clear(): Explicitly release parsing objects"
        self.pulldom.clear()
        del self.pulldom
        self.parser = self.stream = None
222
class SAX2DOM(PullDOM):
    """PullDOM variant that also links the nodes it creates into the tree.

    Elements are appended to the node below them on the element stack;
    processing instructions and text nodes are appended to the node on top
    of the stack.  comment() is not overridden here.
    """

    def _attach_new_element(self):
        # The element just pushed goes under the entry directly below it.
        element = self.elementStack[-1]
        self.elementStack[-2].appendChild(element)

    def _attach_latest_node(self):
        # The node carried by the most recently queued event goes under
        # the element currently on top of the stack.
        latest = self.lastEvent[0][1]
        self.elementStack[-1].appendChild(latest)

    def startElementNS(self, name, tagName, attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        self._attach_new_element()

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        self._attach_new_element()

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        self._attach_latest_node()

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        self._attach_latest_node()

    def characters(self, chars):
        PullDOM.characters(self, chars)
        self._attach_latest_node()
Fred Drake16f63292000-10-23 18:09:50 +0000254
Fred Drakec16adce2000-12-14 18:00:18 +0000255
# Chunk size used by parse() when the caller does not pass bufsize:
# 16384 - 20 == 16364.
default_bufsize = (1 << 14) - 20
257
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over an XML document.

    stream_or_string -- a filename (an instance of one of the types in
        _StringTypes), which is opened for reading, or an object that is
        used as the input stream as-is.
    parser -- SAX parser to drive; xml.sax.make_parser() is used when this
        is false.
    bufsize -- chunk size handed to the stream's read(); default_bufsize
        when None.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() rather than an exact type() match, so that subclasses of
    # the string types are treated as filenames too.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # The file is not closed here; the returned stream holds it.
        stream = open(stream_or_string)
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000268
def parseString(string, parser=None):
    """Return a DOMEventStream over an XML document held in a string.

    string -- the document text; it is wrapped in an in-memory file object
        and read in a single chunk of len(string).
    parser -- SAX parser to drive; xml.sax.make_parser() is used when this
        is false.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module is available; use the io implementation
            # instead of failing with ImportError.
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)