blob: 96a8d59519ef45957b101b7295d044c316d4e84c [file] [log] [blame]
Thomas Wouters0e3f5912006-08-11 14:57:12 +00001import xml.sax
2import xml.sax.handler
Martin v. Löwis011ea472000-12-28 18:43:02 +00003
# Event type tags.  Each event produced by the handlers in this module is a
# ``(tag, node)`` pair whose first item is one of these strings.
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"
COMMENT = "COMMENT"
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
CHARACTERS = "CHARACTERS"
Fred Drake55c38192000-06-29 19:39:57 +000012
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that turns parse callbacks into DOM-node events.

    Events are kept in a singly linked list of two-item lists
    ``[event, next]`` where ``event`` is a ``(tag, node)`` tuple.
    ``firstEvent`` is a sentinel head cell (its own event slot is ``None``)
    and ``lastEvent`` is the cell new events are linked after.
    """

    _locator = None
    document = None
    # Namespace declarations collected by startPrefixMapping() and consumed
    # by the next startElementNS(); None until the first mapping is seen.
    _xmlns_attrs = None

    def __init__(self, documentFactory=None):
        """Create a handler.

        documentFactory, if given, must provide ``createDocument(uri,
        tagname, doctype)``; when it is None, startDocument() falls back to
        the minidom implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        # Bind the list methods directly on the instance; the instance
        # attribute ``pop`` shadows the pop() method defined below.
        self.push = self.elementStack.append
        self.pop = self.elementStack.pop
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []

    def pop(self):
        """Remove and return the top of the element stack."""
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Record a namespace declaration for the element about to start."""
        if self._xmlns_attrs is None:
            self._xmlns_attrs = []
        # A default-namespace declaration has no prefix; store it as 'xmlns'.
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        # Save a snapshot of the context as it was before this mapping, then
        # update the live context in place.
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Restore the namespace context saved by startPrefixMapping()."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create a namespace-aware element node and queue START_ELEMENT.

        name is a ``(uri, localname)`` pair; tagName is the qualified name
        or None.  The first element seen also creates the document through
        buildDocument().
        """
        # Retrieve xml namespace declaration attributes.
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        xmlns_attrs = getattr(self, '_xmlns_attrs', None)
        if xmlns_attrs is not None:
            for aname, value in xmlns_attrs:
                attrs._attrs[(xmlns_uri, aname)] = value
            self._xmlns_attrs = []
        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Pop the current element and queue END_ELEMENT for it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Create an element node (no namespaces) and queue START_ELEMENT."""
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Pop the current element and queue END_ELEMENT for it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Queue a COMMENT event, deferring it if no document exists yet."""
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # No document to create the node with; buildDocument() turns
            # this raw entry into a real node later.
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event, deferring it if needed."""
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            # See comment(): converted to a node by buildDocument().
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event holding a new text node.

        Uses self.document, so the document must already exist.
        """
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Queue a CHARACTERS event holding a new text node.

        Uses self.document, so the document must already exist.
        """
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Select the default (minidom) document factory if none was given."""
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document, queue START_DOCUMENT, flush pending events.

        Returns the ``firstChild`` of the newly created document.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ", e[0][0])
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Link an END_DOCUMENT event and pop the document off the stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        """Explicitly release parsing structures."""
        self.document = None
195
class ErrorHandler:
    """Error handler that prints warnings and raises errors."""

    def warning(self, exception):
        """Print the warning and carry on."""
        print(exception)

    def error(self, exception):
        """Raise the reported exception."""
        raise exception

    def fatalError(self, exception):
        """Raise the reported exception."""
        raise exception
Fred Drake55c38192000-06-29 19:39:57 +0000203
class DOMEventStream:
    """Iterator over the ``(tag, node)`` events produced while parsing.

    If the parser has a ``feed`` method, the stream is read ``bufsize``
    items at a time and fed to it on demand.  Otherwise the whole stream is
    parsed on the first request and the buffered events are handed out
    afterwards.
    """

    def __init__(self, stream, parser, bufsize):
        """Store the collaborators and install a fresh content handler.

        stream must provide ``read(size)`` (or be acceptable to
        ``parser.parse`` when the parser has no ``feed`` method).
        """
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Not an incremental parser: swap in the slurping fallback.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Attach a new PullDOM handler to the parser."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Deprecated: return the next event; ``pos`` is ignored.

        Raises IndexError when no event is available.
        """
        import warnings
        warnings.warn(
            "DOMEventStream's __getitem__ method ignores 'pos' parameter. "
            "Use iterator protocol instead.",
            DeprecationWarning,
            stacklevel=2
        )
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def __next__(self):
        """Return the next event or raise StopIteration."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Consume events and attach the resulting nodes below ``node``.

        Stops when an event whose node is ``node`` itself is read, or when
        the events run out.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next event, feeding the parser as needed.

        Returns None once the stream yields no more data.
        """
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: new events must be linked after the head cell.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None when no buffered event is left.
        """
        cell = self.pulldom.firstEvent[1]
        if not cell:
            # Queue exhausted: report "no event" like getEvent() does,
            # instead of failing while indexing None.
            return None
        rc = cell[0]
        self.pulldom.firstEvent[1] = cell[1]
        return rc

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
295
class SAX2DOM(PullDOM):
    """PullDOM variant that also links each new node into the DOM tree.

    After the base class has created a node and queued its event, the node
    is appended to the node below it on the element stack.
    """

    def startElementNS(self, name, tagName, attrs):
        PullDOM.startElementNS(self, name, tagName, attrs)
        # The base class pushed the new element; its parent sits below it.
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # No document yet: the base class only queued the instruction as
            # a pending event, so there is neither a node nor a parent to
            # attach it to.  NOTE(review): such an instruction still shows
            # up in the event list but is not linked into the tree.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
Fred Drake16f63292000-10-23 18:09:50 +0000327
Fred Drakec16adce2000-12-14 18:00:18 +0000328
# Default read size handed to DOMEventStream by parse(): 16 KiB minus 20.
default_bufsize = (2 ** 14) - 20
330
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream for a file name or an open stream.

    stream_or_string -- a ``str`` is treated as a file name and opened in
        binary mode; anything else is used as the stream as is.
    parser -- SAX parser to use; a falsy value selects
        ``xml.sax.make_parser()``.
    bufsize -- read size for the event stream; None selects
        ``default_bufsize``.

    A file opened here is handed to the returned object and is not closed
    by this function.
    """
    if bufsize is None:
        bufsize = default_bufsize
    if isinstance(stream_or_string, str):
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake55c38192000-06-29 19:39:57 +0000341
def parseString(string, parser=None):
    """Return a DOMEventStream that reads from the text in ``string``.

    The text is wrapped in an ``io.StringIO`` and its length is used as the
    buffer size.  A falsy ``parser`` selects ``xml.sax.make_parser()``.
    """
    from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(buf, parser, bufsize)
Fred Drake1f549022000-09-24 05:21:58 +0000349 return DOMEventStream(buf, parser, bufsize)