blob: 81e2df70d7a41c67825b9825f861b77667211648 [file] [log] [blame]
"""Facility to use the Expat parser to load a minidom instance
from a string or file.

This avoids all the overhead of SAX and pulldom to gain performance.
"""
6
# Warning!
#
# This module is tightly bound to the implementation details of the
# minidom DOM and can't be used with other DOM implementations.  This
# is due, in part, to a lack of appropriate methods in the DOM (there is
# no way to create Entity and Notation nodes via the DOM Level 2
# interface), and for performance.  The latter is the cause of some fairly
# cryptic code.
#
# Performance hacks:
#
#   -  .character_data_handler() has an extra case in which continuing
#      data is appended to an existing Text node; this can be a
#      speedup since pyexpat can break up character data into multiple
#      callbacks even though we set the buffer_text attribute on the
#      parser.  This also gives us the advantage that we don't need a
#      separate normalization pass.
#
#   -  Determining that a node exists is done using an identity comparison
#      with None rather than a truth test; this avoids searching for and
#      calling any methods on the node object if it exists.  (A rather
#      nice speedup is achieved this way as well!)
29
Thomas Wouters0e3f5912006-08-11 14:57:12 +000030from xml.dom import xmlbuilder, minidom, Node
31from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32from xml.parsers import expat
33from xml.dom.minidom import _append_child, _set_attribute_node
34from xml.dom.NodeFilter import NodeFilter
Martin v. Löwisfc5fec72003-01-25 15:11:07 +000035
# Module-level aliases for the node-type codes used by the handlers below.
TEXT_NODE, CDATA_SECTION_NODE, DOCUMENT_NODE = (
    Node.TEXT_NODE,
    Node.CDATA_SECTION_NODE,
    Node.DOCUMENT_NODE,
)

# Module-level aliases for the DOMBuilderFilter return codes.
FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP, FILTER_INTERRUPT = (
    xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT,
    xmlbuilder.DOMBuilderFilter.FILTER_REJECT,
    xmlbuilder.DOMBuilderFilter.FILTER_SKIP,
    xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT,
)

# DOM implementation object used to create the documents being built.
theDOMImplementation = minidom.getDOMImplementation()
46
47# Expat typename -> TypeInfo
48_typeinfo_map = {
49 "CDATA": minidom.TypeInfo(None, "cdata"),
50 "ENUM": minidom.TypeInfo(None, "enumeration"),
51 "ENTITY": minidom.TypeInfo(None, "entity"),
52 "ENTITIES": minidom.TypeInfo(None, "entities"),
53 "ID": minidom.TypeInfo(None, "id"),
54 "IDREF": minidom.TypeInfo(None, "idref"),
55 "IDREFS": minidom.TypeInfo(None, "idrefs"),
56 "NMTOKEN": minidom.TypeInfo(None, "nmtoken"),
57 "NMTOKENS": minidom.TypeInfo(None, "nmtokens"),
58 }
59
class ElementInfo(object):
    """Information about one element type, as gathered from DTD declarations.

    ``_model`` holds the content model passed to the constructor (or
    assigned afterwards) and ``_attr_info`` is a list of attribute
    declaration records.  Each record is a sequence whose item 1 is the
    attribute name and whose second-to-last item is the attribute type
    string.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self._model = model
        self._attr_info = []
        self.tagName = tagName

    def __getstate__(self):
        """Return the pickled state: (_attr_info, _model, tagName)."""
        return (self._attr_info, self._model, self.tagName)

    def __setstate__(self, state):
        """Restore the three slots from a tuple made by __getstate__()."""
        attr_info, model, tag_name = state
        self._attr_info = attr_info
        self._model = model
        self.tagName = tag_name

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute *aname*.

        A declared type starting with "(" is reported as the "ENUM" entry
        of _typeinfo_map; undeclared attributes get minidom._no_type.
        """
        for record in self._attr_info:
            if record[1] != aname:
                continue
            declared = record[-2]
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Always return minidom._no_type."""
        return minidom._no_type

    def isElementContent(self):
        """Return True if a content model is known and it is neither ANY
        nor MIXED."""
        model = self._model
        if not model:
            return False
        excluded = (expat.model.XML_CTYPE_ANY, expat.model.XML_CTYPE_MIXED)
        return model[0] not in excluded

    def isEmpty(self):
        """Return True if a content model is known and it is EMPTY."""
        model = self._model
        if not model:
            return False
        return model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return True if attribute *aname* is declared with type "ID"."""
        for record in self._attr_info:
            if record[1] == aname:
                return record[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        key = (auri, aname)
        return self.isId(key)
110
111def _intern(builder, s):
112 return builder._intern_setdefault(s, s)
113
114def _parse_ns_name(builder, name):
115 assert ' ' in name
116 parts = name.split(' ')
117 intern = builder._intern_setdefault
118 if len(parts) == 3:
119 uri, localname, prefix = parts
120 prefix = intern(prefix, prefix)
121 qname = "%s:%s" % (prefix, localname)
122 qname = intern(qname, qname)
123 localname = intern(localname, localname)
124 else:
125 uri, localname = parts
126 prefix = EMPTY_PREFIX
127 qname = localname = intern(localname, localname)
128 return intern(uri, uri), localname, prefix, qname
129
130
class ExpatBuilder:
    """Document builder that uses Expat to build a ParsedXML.DOM document
    instance."""

    def __init__(self, options=None):
        """Create a builder.

        options -- an xmlbuilder.Options instance; a default one is
                   created when omitted.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.  (The builtin
            # id() serves as a cheap one-argument no-op.)
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while True:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # Only the first chunk is searched for an internal subset.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # Raised by the filter wrapper to stop parsing early; whatever
            # has been built so far is returned.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the node it is detached again and the entity
        and notation declaration handlers are switched off.  While an
        internal subset is being read, comment and processing-instruction
        handlers are disabled; end_doctype_decl_handler() restores them.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the handlers turned off for the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # Nothing to do at element end: no DTD info and no filter.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a processing instruction unless the filter rejects it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are preserved.

        Inside a CDATA section the data goes to a CDATASection node
        (continuing the one just created, if any); otherwise it is appended
        to a trailing Text node or a new Text node is created.
        """
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            value = node.data + data
            node.data = value
            return
        else:
            node = minidom.Text()
            node.data = data
            node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Append *data* to a trailing Text node, or add a new Text node."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            node.data = node.data + data
            return
        node = minidom.Text()
        node.data = node.data + data
        node.ownerDocument = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype.

        Parameter entities are ignored, as is everything when the
        ``entities`` option is off.  The entity is dropped again if the
        filter rejects it.
        """
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype.

        The notation is dropped again only if the filter rejects it.
        """
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Remove on REJECT, matching the entity, comment and PI handlers;
        # comparing against FILTER_ACCEPT here discarded accepted notations.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Return 1 without loading the external entity."""
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the document element, then switch to the regular
        start-element handler for all later elements."""
        if self._filter is None and not self._elem_info:
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an element with its attributes and make it current.

        *attributes* is a flat list of alternating names and values.
        """
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                a.value = value
                a.ownerDocument = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Ask the filter about a newly started element.

        On FILTER_REJECT or FILTER_SKIP a Rejecter or Skipper takes over
        the parser's element handlers and *node* is removed from the tree.
        """
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendents
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a completed element.

        Ignorable whitespace is stripped when element type information is
        available, and the element is removed if the filter rejects it.
        """
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of *node* when its content
        model is element content and whitespace is not being preserved."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify the text nodes which contain only
        # whitespace.  Collect first so the child list is not modified
        # while it is being iterated.
        blanks = [child for child in node.childNodes
                  if child.nodeType == TEXT_NODE and not child.data.strip()]

        # Remove ignorable whitespace from the tree.
        for child in blanks:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Store the content model declared for element type *name*."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Store one attribute declaration for element type *elem*."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration's values onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            self.document.standalone = bool(standalone)
448
449
# Values a filter callback may legitimately return.
# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)
453
class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Forward to the filter's startContainer() if *node*'s type is shown.

        Returns FILTER_ACCEPT when the node type is masked out by
        whatToShow.  Raises ParseEscape on FILTER_INTERRUPT and ValueError
        on any other value outside _ALLOWED_FILTER_RETURNS.
        """
        shown = self.filter.whatToShow & self._nodetype_mask[node.nodeType]
        if not shown:
            return FILTER_ACCEPT
        verdict = self.filter.startContainer(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                "startContainer() returned illegal value: " + repr(verdict))
        return verdict

    def acceptNode(self, node):
        """Forward to the filter's acceptNode() if *node*'s type is shown.

        Returns FILTER_ACCEPT when the node type is masked out by
        whatToShow.  On FILTER_SKIP the node's children are appended to its
        parent and FILTER_REJECT is returned so the caller drops the node.
        Raises ParseEscape on FILTER_INTERRUPT and ValueError on any other
        value outside _ALLOWED_FILTER_RETURNS.
        """
        shown = self.filter.whatToShow & self._nodetype_mask[node.nodeType]
        if not shown:
            return FILTER_ACCEPT
        verdict = self.filter.acceptNode(node)
        if verdict == FILTER_INTERRUPT:
            raise ParseEscape
        if verdict == FILTER_SKIP:
            # move all child nodes to the parent, and remove this node
            new_parent = node.parentNode
            # iterate over a copy: appendChild() changes node.childNodes
            for orphan in node.childNodes[:]:
                new_parent.appendChild(orphan)
            # node is handled by the caller
            return FILTER_REJECT
        if verdict not in _ALLOWED_FILTER_RETURNS:
            raise ValueError(
                "acceptNode() returned illegal value: " + repr(verdict))
        return verdict

    # Node type -> whatToShow bit for that type.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }
510
511
class FilterCrutch(object):
    """Base for helpers that temporarily take over a builder's parser.

    On construction the parser's current start/end element handlers are
    saved and replaced by this object's own start_element_handler and
    end_element_handler, which subclasses must provide.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        self._builder = builder
        self._level = 0
        expat_parser = builder._parser
        # remember the handlers being displaced...
        self._old_start = expat_parser.StartElementHandler
        self._old_end = expat_parser.EndElementHandler
        # ...and put our own in their place
        expat_parser.StartElementHandler = self.start_element_handler
        expat_parser.EndElementHandler = self.end_element_handler
523
class Rejecter(FilterCrutch):
    """Ignore an element and everything inside it.

    All content handlers are cleared while the rejected element is open;
    nested elements are only counted.
    """

    __slots__ = ()

    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        expat_parser = builder._parser
        silenced = (
            "ProcessingInstructionHandler",
            "CommentHandler",
            "CharacterDataHandler",
            "StartCdataSectionHandler",
            "EndCdataSectionHandler",
            "ExternalEntityRefHandler",
        )
        for handler_name in silenced:
            setattr(expat_parser, handler_name, None)

    def start_element_handler(self, *args):
        # one level deeper inside the rejected element
        self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            # closing a nested element
            self._level -= 1
            return
        # Closing the rejected element itself: restore the old handlers.
        expat_parser = self._builder._parser
        self._builder.install(expat_parser)
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
551
class Skipper(FilterCrutch):
    """Drop one element while letting its content be built as usual.

    Element events are forwarded to the displaced handlers; the nesting
    depth is tracked so the end of the skipped element can be recognised.
    """

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Only count the element if the builder actually descended into it.
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        expat_parser = self._builder._parser
        expat_parser.StartElementHandler = self._old_start
        expat_parser.EndElementHandler = self._old_end
        self._builder = None
571
572
# Framework document used by the fragment builder.
# Takes a string for the doctype, subset string, and namespace attrs string.

# System identifier of the external entity declared in the template below.
_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = \
    "http://xml.python.org/entities/fragment-builder/internal"

# The system id is substituted here, when the module is loaded; the three
# "%%s" placeholders become "%s" and are filled in by a later formatting step.
_FRAGMENT_BUILDER_TEMPLATE = (
    '''\
<!DOCTYPE wrapper
 %%s [
 <!ENTITY fragment-builder-internal
 SYSTEM "%s">
%%s
]>
<wrapper %%s
>&fragment-builder-internal;</wrapper>'''
    % _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID)
590
591
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Remember *context* and the document it belongs to.

        *context* may be the Document itself, in which case it is also
        used as ``originalDocument``.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs()  # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        try:
            parser.Parse(document, 1)
        except BaseException:
            # Leave the builder in a clean state, then propagate.
            self.reset()
            raise
        fragment = self.fragment
        self.reset()
        # NOTE(review): the parser is deliberately left in place here; the
        # original carried a commented-out "self._parser = None".
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: when the
        # context is the Document itself its ownerDocument is None.
        doctype = self.originalDocument.doctype
        if not doctype:
            return ""
        declarations = []
        for i in range(doctype.notations.length):
            notation = doctype.notations.item(i)
            decl = "<!NOTATION %s" % notation.nodeName
            if notation.publicId:
                decl = '%s PUBLIC "%s"\n "%s">' \
                       % (decl, notation.publicId, notation.systemId)
            else:
                decl = '%s SYSTEM "%s">' % (decl, notation.systemId)
            declarations.append(decl)
        for i in range(doctype.entities.length):
            entity = doctype.entities.item(i)
            decl = "<!ENTITY %s" % entity.nodeName
            if entity.publicId:
                decl = '%s PUBLIC "%s"\n "%s"' \
                       % (decl, entity.publicId, entity.systemId)
            elif entity.systemId:
                decl = '%s SYSTEM "%s"' % (decl, entity.systemId)
            else:
                # internal entity: its replacement text is the first child
                decl = '%s "%s"' % (decl, entity.firstChild.data)
            if entity.notationName:
                decl = "%s NOTATION %s" % (decl, entity.notationName)
            declarations.append(decl + ">")
        return "\n ".join(declarations)

    def _getNSattrs(self):
        """Return the namespace attribute string for the wrapper element
        (always empty in this class)."""
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when the wrapper's entity is referenced.

        The template document refers to one external entity; when Expat
        asks for it, the real source text is parsed with a sub-parser whose
        nodes are created by the original document and collected in a new
        DocumentFragment.  Any other entity is passed to ExpatBuilder.
        """
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, 1)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
707
708
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # list of (prefix, uri) ns declarations.  Namespace attrs are
        # constructed from this and added to the element's attrs.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        declaration = (prefix, uri)
        self._ns_ordered_prefixes.append(declaration)

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware element and make it current.

        Pending namespace declarations become xmlns attributes on the new
        element; *attributes* is a flat list of alternating names and
        values.
        """
        if ' ' in name:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        else:
            uri, qname = EMPTY_NAMESPACE, name
            localname, prefix = None, EMPTY_PREFIX
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        pending = self._ns_ordered_prefixes
        if pending:
            for ns_prefix, ns_uri in pending:
                if ns_prefix:
                    decl = minidom.Attr(_intern(self, 'xmlns:' + ns_prefix),
                                        XMLNS_NAMESPACE, ns_prefix, "xmlns")
                else:
                    decl = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                decl.value = ns_uri
                decl.ownerDocument = self.document
                _set_attribute_node(node, decl)
            # the declarations now live on the element
            del pending[:]

        if attributes:
            node._ensure_attributes()
            by_qname = node._attrs
            by_ns = node._attrsNS
            for pos in range(0, len(attributes), 2):
                aname = attributes[pos]
                value = attributes[pos + 1]
                if ' ' in aname:
                    a_uri, a_local, a_prefix, a_qname = \
                        _parse_ns_name(self, aname)
                    attr = minidom.Attr(a_qname, a_uri, a_local, a_prefix)
                    by_qname[a_qname] = attr
                    by_ns[(a_uri, a_local)] = attr
                else:
                    attr = minidom.Attr(aname, EMPTY_NAMESPACE,
                                        aname, EMPTY_PREFIX)
                    by_qname[aname] = attr
                    by_ns[(EMPTY_NAMESPACE, aname)] = attr
                attr.ownerDocument = self.document
                attr.value = value
                attr.ownerElement = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            closing = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (closing.namespaceURI == uri
                        and closing.localName == localname
                        and closing.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert closing.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert closing.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = closing.parentNode
            self._finish_end_element(closing)
802
803
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        """Reset the builder state, including the pending namespace
        declarations."""
        ExpatBuilder.reset(self)
        # start with no pending namespace declarations
        self._initNamespaces()
810
811
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        result = ""
        seen_prefixes = []
        node = self.context
        while node:
            if hasattr(node, '_ns_prefix_uri'):
                for prefix, uri in node._ns_prefix_uri.items():
                    # A prefix already declared closer to the context
                    # wins; only new ones are added to the string.
                    if prefix in seen_prefixes:
                        continue
                    seen_prefixes.append(prefix)
                    declname = "xmlns:" + prefix if prefix else "xmlns"
                    if result:
                        result = "%s\n %s='%s'" % (result, declname, uri)
                    else:
                        result = " %s='%s'" % (declname, uri)
            node = node.parentNode
        return result
847
848
class ParseEscape(Exception):
    """Exception raised to short-circuit parsing in InternalSubsetExtractor."""
852
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    # None until a doctype with an internal subset has been seen.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        """Parse *file*, swallowing the ParseEscape that ends extraction."""
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        """Parse *string*, swallowing the ParseEscape that ends extraction."""
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        # Only the doctype and the first element are of interest.
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        if not has_internal_subset:
            # nothing to extract
            raise ParseEscape()
        expat_parser = self.getParser()
        self.subset = []
        # Collect the raw text of the subset piece by piece.
        expat_parser.DefaultHandler = self.subset.append
        expat_parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        # Join the collected pieces and normalize line endings to "\n".
        raw = ''.join(self.subset)
        self.subset = raw.replace('\r\n', '\n').replace('\r', '\n')
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reaching an element means there is no (more) doctype to read.
        raise ParseEscape()
895
896
def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()

    if not isinstance(file, str):
        # already an open file object
        return builder.parseFile(file)
    with open(file, 'rb') as fp:
        return builder.parseFile(fp)
913
914
def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    builder_class = ExpatBuilderNS if namespaces else ExpatBuilder
    return builder_class().parseString(string)
924
925
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    builder = builder_class(context)

    if not isinstance(file, str):
        # already an open file object
        return builder.parseFile(file)
    with open(file, 'rb') as fp:
        return builder.parseFile(fp)
944
945
def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_class(context).parseString(string)
956
957
def makeBuilder(options):
    """Create a builder based on an Options object."""
    builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_class(options)