Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 1 | import minidom |
Lars Gustäbel | b798c01 | 2000-09-21 08:38:46 +0000 | [diff] [blame] | 2 | import xml.sax |
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 3 | |
Lars Gustäbel | e84bf75 | 2000-09-24 18:31:37 +0000 | [diff] [blame] | 4 | #todo: namespace handling |
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 5 | |
# Event tokens.  Each queued event is a (token, node) pair whose first item
# is one of these strings.

# Document and element boundaries.
START_DOCUMENT = "START_DOCUMENT"
END_DOCUMENT = "END_DOCUMENT"
START_ELEMENT = "START_ELEMENT"
END_ELEMENT = "END_ELEMENT"

# Leaf content.
CHARACTERS = "CHARACTERS"
IGNORABLE_WHITESPACE = "IGNORABLE_WHITESPACE"
COMMENT = "COMMENT"
PROCESSING_INSTRUCTION = "PROCESSING_INSTRUCTION"
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 14 | |
class PullDOM:
    """SAX content handler that turns parser callbacks into queued DOM events.

    Each callback creates the matching minidom node, records its parent and
    queues a ``(token, node)`` pair.  Nodes are never added to their parent's
    ``childNodes`` by this class; only ``parentNode`` (and, when the parent
    already has children, the sibling links) are filled in.

    The queue is a singly linked list of ``[payload, next]`` cells.
    ``firstEvent`` is a dummy head cell whose second slot points at the oldest
    pending event; ``lastEvent`` is the cell new events are chained onto.
    """

    def __init__(self):
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent

    def _pushEvent(self, token, node):
        """Append the event ``(token, node)`` to the tail of the queue."""
        cell = [(token, node), None]
        self.lastEvent[1] = cell
        self.lastEvent = cell

    def _linkToParent(self, node):
        """Point *node* at the current node and at its last existing child."""
        parent = self.curNode
        node.parentNode = parent
        if parent.childNodes:
            node.previousSibling = parent.childNodes[-1]
            node.previousSibling.nextSibling = node

    def setDocumentLocator(self, locator):
        """Accept the parser's locator; it is not used."""
        pass

    def startElement(self, name, attrs):
        """Create an element with *attrs*, make it current, queue START_ELEMENT."""
        if not hasattr(self, "curNode"):
            # FIXME: hack!  No document exists yet (startDocument() was not
            # called), so create one before building the first element.
            self.startDocument()

        node = self.document.createElement(name)
        for (attr, value) in attrs.items():
            node.setAttribute(attr, value)

        self._linkToParent(node)
        self.curNode = node
        self._pushEvent(START_ELEMENT, node)

    def endElement(self, name):
        """Queue END_ELEMENT for the current element and step back to its parent."""
        node = self.curNode
        self._pushEvent(END_ELEMENT, node)
        self.curNode = node.parentNode

    def comment(self, s):
        """Create a comment node under the current node and queue COMMENT."""
        node = self.document.createComment(s)
        self._linkToParent(node)
        self._pushEvent(COMMENT, node)

    def processingInstruction(self, target, data):
        """Create a processing-instruction node and queue PROCESSING_INSTRUCTION."""
        node = self.document.createProcessingInstruction(target, data)
        self._linkToParent(node)
        self._pushEvent(PROCESSING_INSTRUCTION, node)

    def ignorableWhitespace(self, chars):
        """Create a text node holding *chars* and queue IGNORABLE_WHITESPACE."""
        # The whole string is used: this callback receives no offset/length
        # arguments to slice it with.
        node = self.document.createTextNode(chars)
        self._linkToParent(node)
        self._pushEvent(IGNORABLE_WHITESPACE, node)

    def characters(self, chars):
        """Create a text node holding *chars* and queue CHARACTERS."""
        node = self.document.createTextNode(chars)
        # NOTE(review): unlike the other callbacks, only the parent is
        # recorded here, not the sibling links -- kept as it was.
        node.parentNode = self.curNode
        self._pushEvent(CHARACTERS, node)

    def startDocument(self):
        """Create the document, make it the current node, queue START_DOCUMENT."""
        node = self.curNode = self.document = minidom.Document()
        node.parentNode = None
        self._pushEvent(START_DOCUMENT, node)

    def endDocument(self):
        """Record the document element, if any, and queue END_DOCUMENT.

        The event carries the document itself, so it is well defined even when
        the document has no child nodes.
        """
        # Every element must have been closed: the current node has to be the
        # parentless document again.
        assert not self.curNode.parentNode
        # If several element children are present, the last one wins.
        for node in self.curNode.childNodes:
            if node.nodeType == node.ELEMENT_NODE:
                self.document.documentElement = node
        #if not self.document.documentElement:
        #    raise Error, "No document element"

        self._pushEvent(END_DOCUMENT, self.document)
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 107 | |
class ErrorHandler:
    """Error handler that prints warnings and re-raises everything else."""

    def warning(self, exception):
        """Print *exception* to standard output and carry on."""
        # Parenthesised single-argument form: prints the same text whether
        # ``print`` is a statement or a function.
        print(exception)

    def error(self, exception):
        """Raise *exception*."""
        raise exception

    def fatalError(self, exception):
        """Raise *exception*."""
        raise exception
| 115 | |
class DOMEventStream:
    """Pull ``(token, node)`` events out of a stream that is parsed on demand.

    Data is read from *stream* in pieces of at most *bufsize* and fed to
    *parser*; the events queued by the PullDOM content handler are then handed
    out one at a time.  Instances support the old-style sequence protocol, so
    they can be used directly in a ``for`` loop.
    """

    def __init__(self, stream, parser, bufsize):
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        self.reset()

    def reset(self):
        """Install a fresh PullDOM handler on the parser."""
        handler = PullDOM()
        self.pulldom = handler
        self.parser.setContentHandler(handler)

    def __getitem__(self, pos):
        """Return the next event; *pos* is ignored.

        Raises IndexError when no further event is available, which is what
        ends a ``for`` loop over the stream.
        """
        event = self.getEvent()
        if not event:
            raise IndexError
        return event

    def expandNode(self, node):
        """Consume events, attaching each node to its parent, up to *node*.

        Stops at the first event whose node is *node* itself (normally its
        END_ELEMENT event) or when the events run out.
        """
        while 1:
            event = self.getEvent()
            if not event:
                return
            token, cur_node = event
            if cur_node is node:
                return
            # An END_ELEMENT carries a node that its start event already
            # attached, so only the other tokens trigger an append.
            if token != END_ELEMENT:
                cur_node.parentNode.appendChild(cur_node)

    def getEvent(self):
        """Return the oldest pending event, or None once the stream is exhausted."""
        handler = self.pulldom
        head = handler.firstEvent
        if not head[1]:
            # The queue is empty: new events must be chained straight onto
            # the dummy head cell again.
            handler.lastEvent = head
        while not head[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                #FIXME: why doesn't Expat close work?
                #self.parser.close()
                return None
            self.parser.feed(buf)
        # Unlink the first cell and hand back its payload.
        cell = head[1]
        head[1] = cell[1]
        return cell[0]
| 156 | |
| 157 | # FIXME: sax2 |
| 158 | #def _getParser( ): |
| 159 | # from xml.sax.saxexts import make_parser |
| 160 | # expat doesn't report errors properly! Figure it out |
| 161 | # return make_parser() |
| 162 | # return make_parser("xml.sax.drivers.drv_xmllib") |
| 163 | |
| 164 | |
| 165 | |
| 166 | def _getParser(): |
Lars Gustäbel | b798c01 | 2000-09-21 08:38:46 +0000 | [diff] [blame] | 167 | return xml.sax.make_parser() |
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 168 | |
# Default read size for parse(): 20 less than 16 KiB (16364).
default_bufsize = 2 ** 14 - 20
| 170 | |
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 171 | # FIXME: move into sax package for common usage |
def parse(stream_or_string, parser=None, bufsize=default_bufsize):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a string (or instance of a string subclass) naming a
        file to open, or any object with a ``read(size)`` method.
    parser -- parser to feed; a default one is created when omitted.
    bufsize -- maximum amount of data read from the stream per request.

    A file opened here is not closed by this function; it stays reachable as
    the ``stream`` attribute of the returned object.
    """
    if isinstance(stream_or_string, type("")):
        stream = open(stream_or_string)
    else:
        stream = stream_or_string
    if not parser:
        parser = _getParser()
    return DOMEventStream(stream, parser, bufsize)
Fred Drake | 55c3819 | 2000-06-29 19:39:57 +0000 | [diff] [blame] | 180 | |
def parseString(string, parser=None):
    """Return a DOMEventStream over the XML text in *string*.

    string -- the document text; it is read back in a single piece because
        the buffer size is set to its length.
    parser -- parser to feed; a default one is created only when omitted.
    """
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            # Neither legacy module is available on this interpreter.
            from io import StringIO

    bufsize = len(string)
    buf = StringIO(string)
    # Honour a caller-supplied parser instead of always replacing it.
    if not parser:
        parser = _getParser()
    return DOMEventStream(buf, parser, bufsize)