#! /usr/bin/env python

"""Perform massive transformations on a document tree created from the LaTeX
of the Python documentation, and dump the ESIS data for the transformed tree.
"""
6
7
8import errno
9import esistools
10import re
11import string
12import sys
13import xml.dom
14import xml.dom.minidom
15
# Short aliases for the DOM node-type codes that the passes below test
# node.nodeType against.
ELEMENT = xml.dom.Node.ELEMENT_NODE
TEXT = xml.dom.Node.TEXT_NODE
ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE
19
20
class ConversionError(Exception):
    """Raised when a fix-up pass meets content it cannot convert."""
23
24
# ewrite() sends diagnostics to stderr; bwrite() does the same but, when
# stderr is a terminal on a POSIX system, wraps the text in bold.
ewrite = sys.stderr.write
try:
    # We can only do this trick on Unix (if tput is on $PATH)!
    # os.name is the right test for "is this POSIX?": sys.platform holds
    # values such as "linux2" or "darwin" and is never equal to "posix",
    # so comparing against it disabled the bold output everywhere.
    import os
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    import commands
except ImportError:
    bwrite = ewrite
else:
    def bwrite(s, BOLDON=commands.getoutput("tput bold"),
               BOLDOFF=commands.getoutput("tput sgr0")):
        """Write s to stderr between the terminal's bold-on/off sequences.

        The two escape sequences are fetched from tput once, when this
        function is defined, and kept as default arguments.
        """
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
37
38
# Tag name given to the paragraph elements created by build_para().
PARA_ELEMENT = "para"

# Set to a true value to enable the paragraph fixer's debugging aids.
DEBUG_PARA_FIXER = 0

if not DEBUG_PARA_FIXER:
    def para_msg(s):
        """Discard s; paragraph-fixer tracing is switched off."""
else:
    def para_msg(s):
        """Write the trace message s to stderr, prefixed with '*** '."""
        ewrite("*** %s\n" % s)
49
50
def get_first_element(doc, gi):
    """Return the first direct child of doc whose nodeName is gi, or None.

    Only the immediate children are examined; the tree is not modified.
    """
    matches = [child for child in doc.childNodes if child.nodeName == gi]
    if matches:
        return matches[0]
    return None
55
def extract_first_element(doc, gi):
    """Detach and return the first direct child of doc named gi.

    Returns None, leaving doc untouched, when no such child exists.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
61
62
def get_documentElement(node):
    """Return the last element among node's direct children, or None.

    When several element children are present it is the final one that is
    returned, not the first.
    """
    elements = [child for child in node.childNodes
                if child.nodeType == ELEMENT]
    if elements:
        return elements[-1]
    return None
69
70
def set_tagName(elem, gi):
    """Rename elem in place by setting both its nodeName and tagName to gi."""
    elem.nodeName = gi
    elem.tagName = gi
73
74
def find_all_elements(doc, gi):
    """Return a list of doc (if it is named gi) and all descendants named gi.

    Descendants are gathered per element child: the child itself when it
    matches, followed by whatever its getElementsByTagName(gi) reports.
    """
    found = []
    if doc.nodeName == gi:
        found.append(doc)
    for child in doc.childNodes:
        if child.nodeType != ELEMENT:
            continue
        if child.tagName == gi:
            found.append(child)
        found.extend(child.getElementsByTagName(gi))
    return found
86
def find_all_child_elements(doc, gi):
    """Return a list of the direct children of doc whose nodeName is gi."""
    return [child for child in doc.childNodes if child.nodeName == gi]
93
94
def find_all_elements_from_set(doc, gi_set):
    """Return doc and every element below it whose nodeName is in gi_set.

    The nodes are listed in document (pre-order) order.
    """
    collected = []
    __find_all_elements_from_set(doc, gi_set, collected)
    return collected
97
def __find_all_elements_from_set(doc, gi_set, nodes):
    """Recursive worker for find_all_elements_from_set().

    Appends doc to nodes when its nodeName is in gi_set, then visits each
    element child in turn.  Returns the same nodes list it was given.
    """
    if doc.nodeName in gi_set:
        nodes.append(doc)
    element_children = [kid for kid in doc.childNodes
                        if kid.nodeType == ELEMENT]
    for kid in element_children:
        __find_all_elements_from_set(kid, gi_set, nodes)
    return nodes
105
106
def simplify(doc, fragment):
    """Tidy the top level of fragment so that it looks like one document.

    The <documentclass> node is removed and its classname becomes the tag
    name of the <document> element; a top-level <title> and all top-level
    <input> nodes are moved to the front of the document element; leading
    text nodes of the fragment are dropped.  doc is used to create nodes.
    """
    # These things are simply not valid SGML/XML documents as they stand,
    # and need a little work.
    classnode = extract_first_element(fragment, "documentclass")
    if classnode is None:
        documentclass = "document"
    else:
        documentclass = classnode.getAttribute("classname")

    hoisted = []
    title = extract_first_element(fragment, "title")
    if title is not None:
        hoisted.append(title)

    # update the name of the root element
    root = get_first_element(fragment, "document")
    if root is not None:
        set_tagName(root, documentclass)

    pending = extract_first_element(fragment, "input")
    while pending is not None:
        hoisted.append(pending)
        pending = extract_first_element(fragment, "input")

    if hoisted:
        docelem = get_documentElement(fragment)
        # Insert back to front so the nodes keep their original order.
        for node in reversed(hoisted):
            newline = doc.createTextNode("\n")
            docelem.insertBefore(newline, docelem.firstChild)
            docelem.insertBefore(node, newline)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)

    while fragment.firstChild and fragment.firstChild.nodeType == TEXT:
        fragment.removeChild(fragment.firstChild)
137
138
def cleanup_root_text(doc):
    """Remove the text nodes that are direct children of doc.

    A text node is kept only when the node immediately before it is named
    "COMMENT"; every other top-level text node is removed.
    """
    doomed = []
    follows_comment = False
    for child in doc.childNodes:
        spared = follows_comment
        follows_comment = False
        if child.nodeType == TEXT and not spared:
            doomed.append(child)
        elif child.nodeName == "COMMENT":
            follows_comment = True
    # Removal is deferred so the child list is not changed while iterating.
    for child in doomed:
        doc.removeChild(child)
151
152
# Tag names that find_and_fix_descriptors() hands to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc",
    "cvardesc",
    "ctypedesc",
    "classdesc",
    "memberdesc",
    "memberdescni",
    "methoddesc",
    "methoddescni",
    "excdesc",
    "funcdesc",
    "funcdescni",
    "opcodedesc",
    "datadesc",
    "datadescni",
    )
159
def fixup_descriptors(doc, fragment):
    """Rewrite the descriptor elements found in every <section> of fragment."""
    for section in find_all_elements(fragment, "section"):
        find_and_fix_descriptors(doc, section)
164
165
def find_and_fix_descriptors(doc, container):
    """Rewrite descriptor children of container, descending into subsections.

    Direct element children named in DESCRIPTOR_ELEMENTS are passed to
    rewrite_descriptor(); <subsection> children are processed recursively.
    Other children are left alone.
    """
    for child in container.childNodes:
        if child.nodeType != ELEMENT:
            continue
        gi = child.tagName
        if gi == "subsection":
            find_and_fix_descriptors(doc, child)
        elif gi in DESCRIPTOR_ELEMENTS:
            rewrite_descriptor(doc, child)
175
176
def rewrite_descriptor(doc, descriptor):
    """Restructure one descriptor element into signatures plus description.

    doc is used to create nodes; descriptor is modified in place.

    Raises RuntimeError when a 'var' attribute appears on a descriptor
    other than opcodedesc.  A KeyError from methodline_to_signature() is
    re-raised after the offending element has been written to stderr.
    """
    #
    # Do these things:
    #   1. Add an "index='no'" attribute to the element if the tagName
    #      ends in 'ni', removing the 'ni' from the name.
    #   2. Create a <signature> from the name attribute
    #   2a. Create an <args> if it appears to be available.
    #   3. Create additional <signature>s from <*line{,ni}> elements,
    #      if found.
    #   4. If a <versionadded> is found, move it to an attribute on the
    #      descriptor.
    #   5. Move remaining child nodes to a <description> element.
    #   6. Put it back together.
    #
    # 1.
    descname = descriptor.tagName
    indexed = descname[-2:] != "ni"
    if not indexed:
        descname = descname[:-2]
        descriptor.setAttribute("index", "no")
        set_tagName(descriptor, descname)
    desctype = descname[:-4]            # remove 'desc'
    linename = desctype + "line"
    if not indexed:
        linename = linename + "ni"
    # 2.
    signature = doc.createElement("signature")
    name = doc.createElement("name")
    signature.appendChild(doc.createTextNode("\n "))
    signature.appendChild(name)
    name.appendChild(doc.createTextNode(descriptor.getAttribute("name")))
    descriptor.removeAttribute("name")
    # 2a.
    if descriptor.hasAttribute("var"):
        if descname != "opcodedesc":
            raise RuntimeError(
                "got 'var' attribute on descriptor other than opcodedesc")
        variable = descriptor.getAttribute("var")
        if variable:
            args = doc.createElement("args")
            args.appendChild(doc.createTextNode(variable))
            signature.appendChild(doc.createTextNode("\n "))
            signature.appendChild(args)
        descriptor.removeAttribute("var")
    newchildren = [signature]
    children = descriptor.childNodes
    pos = skip_leading_nodes(children)
    if pos < len(children):
        child = children[pos]
        if child.nodeName == "args":
            # move <args> to <signature>, or remove if empty:
            child.parentNode.removeChild(child)
            if len(child.childNodes):
                signature.appendChild(doc.createTextNode("\n "))
                signature.appendChild(child)
    signature.appendChild(doc.createTextNode("\n "))
    # 3, 4.
    pos = skip_leading_nodes(children, pos)
    while (pos < len(children)
           and children[pos].nodeName in (linename, "versionadded")):
        current = children[pos]
        if current.tagName == linename:
            # this is really a supplemental signature, create <signature>
            snapshot = current.cloneNode(1)
            try:
                sig = methodline_to_signature(doc, current)
            except KeyError:
                # Show the element as it was before the failed conversion.
                # This goes to stderr because main() uses stdout for the
                # ESIS output by default.
                ewrite(snapshot.toxml() + "\n")
                raise
            newchildren.append(sig)
        else:
            # <versionadded added=...>
            descriptor.setAttribute(
                "added", current.getAttribute("version"))
        pos = skip_leading_nodes(children, pos + 1)
    # 5.
    description = doc.createElement("description")
    description.appendChild(doc.createTextNode("\n"))
    newchildren.append(description)
    move_children(descriptor, description, pos)
    last = description.childNodes[-1]
    if last.nodeType == TEXT:
        last.data = last.data.rstrip() + "\n "
    # 6.
    # should have nothing but whitespace and signature lines in <descriptor>;
    # discard them
    while descriptor.childNodes:
        descriptor.removeChild(descriptor.childNodes[0])
    for node in newchildren:
        descriptor.appendChild(doc.createTextNode("\n "))
        descriptor.appendChild(node)
    descriptor.appendChild(doc.createTextNode("\n"))
269
270
def methodline_to_signature(doc, methodline):
    """Build and return a <signature> element from a <*line> element.

    The line's name attribute becomes a <name> child (and is removed from
    methodline); any children of methodline are moved into an <args> child.
    """
    def pad():
        # Whitespace text placed between the parts of the signature.
        return doc.createTextNode("\n ")

    signature = doc.createElement("signature")
    signature.appendChild(pad())
    name = doc.createElement("name")
    name.appendChild(doc.createTextNode(methodline.getAttribute("name")))
    methodline.removeAttribute("name")
    signature.appendChild(name)
    if len(methodline.childNodes) != 0:
        args = doc.createElement("args")
        signature.appendChild(pad())
        signature.appendChild(args)
        move_children(methodline, args)
    signature.appendChild(pad())
    return signature
285
286
def move_children(origin, dest, start=0):
    """Move the children of origin from index start onwards to the end of dest.

    The moved nodes keep their relative order.
    """
    kids = origin.childNodes
    # kids is live: each removal shifts the next node down to index start.
    while len(kids) > start:
        moving = kids[start]
        origin.removeChild(moving)
        dest.appendChild(moving)
293
294
def handle_appendix(doc, fragment):
    """Move everything that follows the <appendix> marker into <back-matter>.

    The first <appendix> element found below a child of the document
    element is removed; all later children of the document element are
    then re-parented under a new <back-matter> element appended to it.
    Nothing happens when there is no document element or no such marker.
    doc is used to create nodes.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = get_documentElement(fragment)
    if docelem is None:
        # No element at the top level, hence no appendix to handle.
        return
    seen_appendix = False
    trailing = []
    for node in docelem.childNodes:
        if seen_appendix:
            trailing.append(node)
        elif node.nodeType == ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                seen_appendix = True
                marker = appnodes[0]
                parent = marker.parentNode
                parent.removeChild(marker)
                parent.normalize()
    if not trailing:
        return
    # Explicit loops rather than map(): the calls are made for their side
    # effects, and map() would not perform them where it is lazy.
    for node in trailing:
        docelem.removeChild(node)
    docelem.appendChild(doc.createTextNode("\n\n\n"))
    back = doc.createElement("back-matter")
    docelem.appendChild(back)
    back.appendChild(doc.createTextNode("\n"))
    # Leading whitespace-only text is dropped, not moved.
    while (trailing and trailing[0].nodeType == TEXT
           and not trailing[0].data.strip()):
        del trailing[0]
    for node in trailing:
        back.appendChild(node)
    docelem.appendChild(doc.createTextNode("\n"))
322
323
def handle_labels(doc, fragment):
    """Turn each <label id="..."/> into an id attribute on its container.

    A label inside a <title> sets the id on the title's parent; any other
    label sets it on its own parent.  The label element is then removed.
    Labels without an id are left in place.  doc is not used.
    """
    for label in find_all_elements(fragment, "label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        in_title = parent.tagName == "title"
        if in_title:
            parent.parentNode.setAttribute("id", label_id)
        else:
            parent.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
        if in_title:
            parent.normalize()
            children = parent.childNodes
            # The title may have held nothing but the label, in which case
            # there is no trailing text to trim.
            if children and children[-1].nodeType == TEXT:
                children[-1].data = children[-1].data.rstrip()
342
343
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the whitespace that ends the elements named in wsmap.

    doc is both the root of the breadth-first walk and the factory for
    new text nodes.  wsmap maps a tag name to the whitespace that should
    end that element: when the element's last child is a text node, its
    trailing whitespace is replaced by the mapped string.
    """
    queue = [doc]
    while queue:
        node = queue.pop(0)
        if node.nodeName in wsmap:
            ws = wsmap[node.nodeName]
            children = node.childNodes
            # Look at the last child directly; the child list is live and
            # must not be reordered, and it may be empty.
            if children and children[-1].nodeType == TEXT:
                children[-1].data = children[-1].data.rstrip() + ws
            # hack to get the title in place:
            if (node.tagName == "title"
                    and node.parentNode.firstChild.nodeType == ELEMENT):
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == ELEMENT:
                queue.append(child)
365
366
def normalize(doc):
    """Call normalize() on each element that is a direct child of doc."""
    elements = [child for child in doc.childNodes
                if child.nodeType == ELEMENT]
    for element in elements:
        element.normalize()
371
372
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    The tree below doc is walked breadth-first.  An element whose tag is
    in element_names is trimmed only when its sole child is a text node
    ending in "()"; such elements are not descended into.
    """
    targets = dict.fromkeys(element_names)
    queue = [kid for kid in doc.childNodes if kid.nodeType == ELEMENT]
    while queue:
        node = queue.pop(0)
        if node.tagName not in targets:
            # Not one of the named elements: keep looking further down.
            queue.extend([kid for kid in node.childNodes
                          if kid.nodeType == ELEMENT])
            continue
        kids = node.childNodes
        if len(kids) == 1 and kids[0].nodeType == TEXT:
            data = kids[0].data
            if data[-2:] == "()":
                kids[0].data = data[:-2]
396
397
def contents_match(left, right):
    """Return 1 if left and right have matching child trees, else 0.

    Children are compared pairwise: elements must agree on tag name and,
    recursively, on contents; text nodes must carry equal data.  Attributes
    are not compared, and any other kind of node counts as a mismatch.
    """
    lkids = left.childNodes
    rkids = right.childNodes
    if len(lkids) != len(rkids):
        return 0
    for l, r in zip(lkids, rkids):
        kind = l.nodeType
        if kind != r.nodeType:
            return 0
        if kind == ELEMENT:
            # should check attributes, but that's not a problem here
            if l.tagName != r.tagName or not contents_match(l, r):
                return 0
        elif kind == TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
420
421
def create_module_info(doc, section):
    """Gather a section's module metadata into a <moduleinfo> element.

    Does nothing unless section has a <modulesynopsis> child.  Otherwise
    the synopsis, module author, platform, module declaration and
    versionadded nodes are pulled out of section and, for a <section>,
    reassembled inside a new <moduleinfo> inserted into it.  doc is used
    to create nodes.
    """
    # Heavy.
    synopsis = extract_first_element(section, "modulesynopsis")
    if synopsis is None:
        return
    set_tagName(synopsis, "synopsis")
    tail = synopsis.childNodes[-1]
    if tail.nodeType == TEXT and tail.data[-1:] == ".":
        tail.data = tail.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        set_tagName(modauthor, "author")
        author_name = modauthor.getAttribute("name")
        modauthor.appendChild(doc.createTextNode(author_name))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.tagName != "section":
        return

    modinfo_pos = 2
    modinfo = doc.createElement("moduleinfo")
    moddecl = extract_first_element(section, "declaremodule")
    name = None
    if moddecl:
        modinfo.appendChild(doc.createTextNode("\n "))
        name = moddecl.attributes["name"].value
        namenode = doc.createElement("name")
        namenode.appendChild(doc.createTextNode(name))
        modinfo.appendChild(namenode)
        typeattr = moddecl.attributes.get("type")
        if typeattr:
            modtype = typeattr.value
            modinfo.appendChild(doc.createTextNode("\n "))
            typenode = doc.createElement("type")
            typenode.appendChild(doc.createTextNode(modtype))
            modinfo.appendChild(typenode)
    versionadded = extract_first_element(section, "versionadded")
    if versionadded:
        modinfo.setAttribute("added", versionadded.getAttribute("version"))
    title = get_first_element(section, "title")
    if title:
        children = title.childNodes
        if (len(children) >= 2
                and children[0].nodeName == "module"
                and children[0].childNodes[0].data == name):
            # this is it; morph the <title> into <short-synopsis>
            first_data = children[1]
            if first_data.data[:4] == " ---":
                first_data.data = first_data.data[4:].lstrip()
            set_tagName(title, "short-synopsis")
            if (children[-1].nodeType == TEXT
                    and children[-1].data[-1:] == "."):
                children[-1].data = children[-1].data[:-1]
            section.removeChild(title)
            section.removeChild(section.childNodes[0])
            title.removeChild(children[0])
            modinfo_pos = 0
        else:
            ewrite("module name in title doesn't match"
                   " <declaremodule/>; no <short-synopsis/>\n")
    else:
        ewrite("Unexpected condition: <section/> without <title/>\n")
    modinfo.appendChild(doc.createTextNode("\n "))
    modinfo.appendChild(synopsis)
    if title and not contents_match(title, synopsis):
        # The short synopsis is actually different,
        # and needs to be stored:
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(title)
    if modauthor:
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(modauthor)
    if platform:
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(platform)
    modinfo.appendChild(doc.createTextNode("\n "))
    section.insertBefore(modinfo, section.childNodes[modinfo_pos])
    section.insertBefore(doc.createTextNode("\n "), modinfo)
    #
    # The rest of this removes extra newlines from where we cut out
    # a lot of elements.  A lot of code for minimal value, but it keeps
    # the generated *ML from being too funny looking.
    #
    section.normalize()
    kids = section.childNodes
    for i, kid in enumerate(kids):
        if kid.nodeName != "moduleinfo":
            continue
        follower = kids[i + 1]
        if follower.nodeType == TEXT:
            data = follower.data
            stripped = data.lstrip()
            if len(stripped) < (len(data) - 4):
                follower.data = "\n\n\n" + stripped
514
515
def cleanup_synopses(doc, fragment):
    """Run create_module_info() on every <section> found in fragment."""
    sections = find_all_elements(fragment, "section")
    for section in sections:
        create_module_info(doc, section)
519
520
def fixup_table_structures(doc, fragment):
    """Run fixup_table() on every <table> found in fragment."""
    tables = find_all_elements(fragment, "table")
    for table in tables:
        fixup_table(doc, table)
524
525
def fixup_table(doc, table):
    """Rebuild table as a single <tgroup> holding a <thead> and a <tbody>.

    The table's direct <entry> children form the one header row; its <row>
    children become the body.  A row that is followed by an <hline> gets
    rowsep="1".  Once heads and rows are moved out, only whitespace text
    and <hline> elements may remain in the table.

    Raises ConversionError for any other leftover content.
    """
    def pad():
        # Whitespace text placed between the structural elements.
        return doc.createTextNode("\n ")

    # create the table head
    thead = doc.createElement("thead")
    headrow = doc.createElement("row")
    move_elements_by_name(doc, table, headrow, "entry")
    thead.appendChild(pad())
    thead.appendChild(headrow)
    thead.appendChild(pad())
    # create the table body
    tbody = doc.createElement("tbody")
    children = table.childNodes
    prev_row = None
    for child in children:
        if child.nodeType != ELEMENT:
            continue
        if child.tagName == "hline" and prev_row is not None:
            prev_row.setAttribute("rowsep", "1")
        elif child.tagName == "row":
            prev_row = child
    # save the rows:
    tbody.appendChild(pad())
    move_elements_by_name(doc, table, tbody, "row", sep="\n ")
    # and toss the rest:
    while children:
        child = children[0]
        if child.nodeType == TEXT:
            if child.data.strip():
                raise ConversionError("unexpected free data in <%s>: %r"
                                      % (table.tagName, child.data))
        elif child.nodeType == ELEMENT:
            if child.tagName != "hline":
                raise ConversionError(
                    "unexpected <%s> in table" % child.tagName)
        else:
            raise ConversionError(
                "unexpected %s node in table" % child.__class__.__name__)
        table.removeChild(child)
    # nothing left in the <table>; add the <thead> and <tbody>
    tgroup = doc.createElement("tgroup")
    tgroup.appendChild(pad())
    tgroup.appendChild(thead)
    tgroup.appendChild(pad())
    tgroup.appendChild(tbody)
    tgroup.appendChild(pad())
    table.appendChild(tgroup)
    # now make the <entry>s look nice:
    for row in table.getElementsByTagName("row"):
        fixup_row(doc, row)
578
579
def fixup_row(doc, row):
    """Insert a whitespace text node before every child of row but the first."""
    # Work from a snapshot; inserting changes the live child list.
    for entry in list(row.childNodes[1:]):
        row.insertBefore(doc.createTextNode("\n "), entry)
586
587
def move_elements_by_name(doc, source, dest, name, sep=None):
    """Move the direct children of source named name to the end of dest.

    When sep is a non-empty string, a text node holding sep is appended to
    dest after each moved node.  doc is used to create those text nodes.
    """
    selected = [child for child in source.childNodes
                if child.nodeName == name]
    for node in selected:
        source.removeChild(node)
        dest.appendChild(node)
        if sep:
            dest.appendChild(doc.createTextNode(sep))
598
599
# Containers whose children are themselves examined for paragraph material.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter", "abstract", "enumerate",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph", "back-matter",
    "howto", "manual",
    "item", "itemize", "fulllineitems", "enumeration", "descriptionlist",
    "definitionlist", "definition",
    )

# Elements that stand at paragraph level: build_para() ends a paragraph in
# front of them and skip_leading_nodes() steps over them.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "verbatim", "enumerate", "item",
    "interpreter-session", "back-matter", "interactive-session",
    "opcodedesc", "classdesc", "datadesc",
    # "memberdescni" was misspelled "membderdescni" here; the spelling now
    # agrees with DESCRIPTOR_ELEMENTS.
    "funcdesc", "methoddesc", "excdesc", "memberdesc", "memberdescni",
    "funcdescni", "methoddescni", "excdescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor", "seealso", "itemize",
    # include <para>, so we can just do it again to get subsequent paras:
    PARA_ELEMENT,
    )

# Elements that skip_leading_nodes() steps over in addition to the
# paragraph-level elements above.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem", "author",
    "stindex", "obindex", "COMMENT", "label", "input", "title",
    "versionadded", "versionchanged", "declaremodule", "modulesynopsis",
    "moduleauthor", "indexterm", "leader",
    )
627
628
def fixup_paras(doc, fragment):
    """Build paragraph elements throughout fragment.

    fixup_paras_helper() is run on each top-level child named in
    RECURSE_INTO_PARA_CONTAINERS and then on every <description> element.
    """
    containers = [child for child in fragment.childNodes
                  if child.nodeName in RECURSE_INTO_PARA_CONTAINERS]
    for container in containers:
        fixup_paras_helper(doc, container)
    for description in find_all_elements(fragment, "description"):
        fixup_paras_helper(doc, description)
636
637
def fixup_paras_helper(doc, container, depth=0):
    """Wrap the loose content of container in paragraph elements.

    Children named in RECURSE_INTO_PARA_CONTAINERS are processed
    recursively; anything else that skip_leading_nodes() stops at is handed
    to build_para().
    """
    # document is already normalized
    # NOTE(review): depth is not forwarded to the recursive call, so the
    # debugging exit below only fires if a caller passes depth=10 itself.
    kids = container.childNodes
    pos = skip_leading_nodes(kids)
    while pos < len(kids):
        if kids[pos].nodeName not in RECURSE_INTO_PARA_CONTAINERS:
            # Paragraph material:
            build_para(doc, container, pos, len(kids))
            if DEBUG_PARA_FIXER and depth == 10:
                sys.exit(1)
        else:
            # Something to recurse into:
            fixup_paras_helper(doc, kids[pos])
        pos = skip_leading_nodes(kids, pos + 1)
652
653
def build_para(doc, parent, start, i):
    """Wrap a run of parent's children, beginning at start, in a paragraph.

    Children from index start up to (not including) index i are scanned.
    The run ends at a blank line inside a text node, at an element named
    in PARA_LEVEL_ELEMENTS or RECURSE_INTO_PARA_CONTAINERS, or at index i.
    Trailing whitespace of the run is split off and left outside the new
    element.

    Returns an index into parent's children that lies after the new
    paragraph.  Raises ConversionError when the run would be empty.
    """
    children = parent.childNodes
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS
    after = start + 1
    reached_end = False
    # Collect all children until \n\n+ is found in a text node or a
    # member of BREAK_ELEMENTS is found.
    for j in range(start, i):
        after = j + 1
        child = children[j]
        if child.nodeType == ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif child.nodeType == TEXT:
            gap = child.data.find("\n\n")
            if gap == 0:
                after = j
                break
            if gap >= 1:
                child.splitText(gap)
                break
    else:
        # The scan ran all the way to index i without stopping.
        reached_end = True
    if after < (start + 1):
        raise ConversionError(
            "build_para() could not identify content to turn into a paragraph")
    tail = children[after - 1]
    if tail.nodeType == TEXT:
        # we may need to split off trailing white space:
        data = tail.data
        trimmed = data.rstrip()
        if trimmed != data:
            reached_end = False
            tail.splitText(len(trimmed))
    para = doc.createElement(PARA_ELEMENT)
    # Move the run into the paragraph back to front, each node going in
    # ahead of the one moved before it.
    prev = None
    for j in range(after - 1, start - 1, -1):
        node = parent.childNodes[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if reached_end:
        parent.appendChild(para)
        parent.appendChild(doc.createTextNode("\n\n"))
        return len(parent.childNodes)
    nextnode = parent.childNodes[start]
    if nextnode.nodeType == TEXT:
        if nextnode.data and nextnode.data[0] != "\n":
            nextnode.data = "\n" + nextnode.data
    else:
        newline = doc.createTextNode("\n")
        parent.insertBefore(newline, nextnode)
        nextnode = newline
        start = start + 1
    parent.insertBefore(para, nextnode)
    return start + 1
714
715
def skip_leading_nodes(children, start=0):
    """Return index into children of a node at which paragraph building should
    begin or a recursive call to fixup_paras_helper() should be made (for
    subsections, etc.).

    When the return value >= len(children), we've built all the paras we can
    from this list of children.
    """
    count = len(children)
    skippable = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    while start < count:
        # skip over leading comments and whitespace:
        child = children[start]
        if child.nodeType == TEXT:
            data = child.data
            stripped = data.lstrip()
            if stripped:
                if stripped == data:
                    return start
                # break into two nodes: whitespace and non-whitespace
                child.splitText(len(data) - len(stripped))
                return start + 1
            # all whitespace, just skip
        elif child.nodeType == ELEMENT:
            gi = child.tagName
            if gi in RECURSE_INTO_PARA_CONTAINERS or gi not in skippable:
                return start
        start = start + 1
    return start
747
748
def fixup_rfc_references(doc, fragment):
    """Append the text "RFC <num>" to every <rfc> element in fragment."""
    for rfc in find_all_elements(fragment, "rfc"):
        label = "RFC " + rfc.getAttribute("num")
        rfc.appendChild(doc.createTextNode(label))
753
754
def fixup_signatures(doc, fragment):
    """Rewrite <optional> markup inside <args> and <constructor-args>.

    For each top-level element of fragment, every <args> descendant and
    then every <constructor-args> descendant is passed to fixup_args() and
    normalized.
    """
    for child in fragment.childNodes:
        if child.nodeType != ELEMENT:
            continue
        for gi in ("args", "constructor-args"):
            for arglist in child.getElementsByTagName(gi):
                fixup_args(doc, arglist)
                arglist.normalize()
766
767
def fixup_args(doc, arglist):
    """Replace each <optional> child of arglist by its contents in brackets.

    The children of an <optional> are moved up into arglist between a "["
    and a "]" text node, and the emptied <optional> is removed.  This is
    repeated until arglist has no <optional> child left, so nested
    optionals are unwrapped as well.  Returns None.
    """
    while True:
        optional = None
        for child in arglist.childNodes:
            if child.nodeName == "optional":
                optional = child
                break
        if optional is None:
            return None
        arglist.insertBefore(doc.createTextNode("["), optional)
        while optional.childNodes:
            kid = optional.childNodes[0]
            optional.removeChild(kid)
            arglist.insertBefore(kid, optional)
        arglist.insertBefore(doc.createTextNode("]"), optional)
        arglist.removeChild(optional)
781
782
def fixup_sectionauthors(doc, fragment):
    """Turn each <sectionauthor name="..."/> into <author>name</author>.

    The element is renamed, its name attribute becomes its text content,
    and it is re-inserted after the section's <title> when the title is
    the second child of the section, otherwise at the start of the section.
    """
    for sectauth in find_all_elements(fragment, "sectionauthor"):
        section = sectauth.parentNode
        section.removeChild(sectauth)
        set_tagName(sectauth, "author")
        sectauth.appendChild(doc.createTextNode(
            sectauth.getAttribute("name")))
        sectauth.removeAttribute("name")
        kids = section.childNodes
        # Sections with fewer than three children used to raise IndexError
        # here.  A reference node of None makes insertBefore() append.
        if len(kids) > 1 and kids[1].nodeName == "title":
            if len(kids) > 2:
                after = kids[2]
            else:
                after = None
        elif kids:
            after = kids[0]
        else:
            after = None
        section.insertBefore(doc.createTextNode("\n "), after)
        section.insertBefore(sectauth, after)
797
798
def fixup_verbatims(doc):
    """Rename <verbatim> elements that show an interactive session.

    A <verbatim> whose first child is text starting (after leading
    whitespace) with ">>>" becomes <interactive-session>.  Empty
    <verbatim> elements are left untouched.
    """
    for verbatim in find_all_elements(doc, "verbatim"):
        kids = verbatim.childNodes
        if not kids:
            # Nothing to inspect; indexing would raise IndexError.
            continue
        first = kids[0]
        if first.nodeType == TEXT and first.data.lstrip()[:3] == ">>>":
            set_tagName(verbatim, "interactive-session")
805
806
def add_node_ids(fragment, counter=0):
    """Set a node_id attribute on fragment and on every node below it.

    fragment receives counter; its descendants receive increasing values
    in document order.  Returns the value to continue counting from.
    """
    fragment.node_id = counter
    next_id = counter
    for node in fragment.childNodes:
        next_id = next_id + 1
        if node.nodeType != ELEMENT:
            node.node_id = next_id
        else:
            next_id = add_node_ids(node, next_id)
    return next_id + 1
816
817
# Tag names of the module index markers handled by fixup_refmodindexes().
REFMODINDEX_ELEMENTS = (
    "refmodindex",
    "refbimodindex",
    "refexmodindex",
    "refstmodindex",
    )
820
def fixup_refmodindexes(fragment):
    """Process each container of a <ref*modindex> element exactly once.

    Requires node_id attributes on the nodes (see add_node_ids()); they
    are used to recognize a parent that holds several index elements.
    """
    # Locate <ref*modindex>...</> co-located with <module>...</>, and
    # remove the <ref*modindex>, replacing it with index=index on the
    # <module> element.
    parents = {}
    for node in find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS):
        parent = node.parentNode
        parents[parent.node_id] = parent
    for parent in parents.values():
        fixup_refmodindexes_chunk(parent)
832
833
def fixup_refmodindexes_chunk(container):
    """Fold <ref*modindex> entries below container into matching <module>s.

    An empty index entry whose module attribute equals the text of a
    <module> element found below container sets index="yes" on that
    element and is then removed.  Entries that have children are reported
    on stderr and left alone.
    """
    # node is probably a <para>; let's see how often it isn't:
    if container.tagName != PARA_ELEMENT:
        bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container)
    module_entries = find_all_elements(container, "module")
    if not module_entries:
        return
    index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS)
    removes = []
    for entry in index_entries:
        if len(entry.childNodes) != 0:
            bwrite("--- unexpected number of children for %s node:\n"
                   % entry.tagName)
            ewrite(entry.toxml() + "\n")
            continue
        module_name = entry.getAttribute("module")
        found = 0
        for node in module_entries:
            kids = node.childNodes
            # Only a lone text child has a name to compare; other node
            # kinds have no data attribute.
            if len(kids) != 1 or kids[0].nodeType != TEXT:
                continue
            if kids[0].data == module_name:
                found = 1
                node.setAttribute("index", "yes")
        if found:
            removes.append(entry)
    for entry in removes:
        # The entries were collected recursively, so an entry need not be
        # a direct child of container; detach it from its real parent.
        entry.parentNode.removeChild(entry)
863
864
def fixup_bifuncindexes(fragment):
    """Process each container of a <bifuncindex> element exactly once.

    Requires node_id attributes on the nodes (see add_node_ids()).
    """
    parents = {}
    # make sure that each parent is only processed once:
    for node in find_all_elements(fragment, "bifuncindex"):
        parent = node.parentNode
        parents[parent.node_id] = parent
    for parent in parents.values():
        fixup_bifuncindexes_chunk(parent)
874
875
def fixup_bifuncindexes_chunk(container):
    """Fold <bifuncindex> children of container into sibling <function>s.

    A <function> child whose text is the entry's name followed by "()" gets
    index="yes" and module="__builtin__"; an entry that matched at least
    one function is removed from container.
    """
    # NOTE(review): convert() runs cleanup_trailing_parens() on <function>
    # elements before this pass, which strips a trailing "()" from their
    # text -- confirm that the "()" test below can still succeed.
    entries = find_all_child_elements(container, "bifuncindex")
    function_entries = find_all_child_elements(container, "function")
    removes = []
    for entry in entries:
        wanted = entry.getAttribute("name")
        matched = 0
        for func_entry in function_entries:
            text = func_entry.childNodes[0].data
            if text[-2:] == "()" and text[:-2] == wanted:
                func_entry.setAttribute("index", "yes")
                func_entry.setAttribute("module", "__builtin__")
                matched = 1
        if matched:
            removes.append(entry)
    for entry in removes:
        container.removeChild(entry)
896
897
def join_adjacent_elements(container, gi):
    """Merge runs of directly adjacent <gi> elements, anywhere below container.

    When two neighbouring siblings are both named gi, the children of the
    second are appended to the first and the second is removed.  Each
    merge is reported on stderr.
    """
    queue = [container]
    while queue:
        parent = queue.pop()
        children = parent.childNodes
        i = 0
        # The length is re-read on every pass because merging shortens the
        # live child list.  The last child is visited as well, so that its
        # own children are examined too.
        while i < len(children):
            child = children[i]
            if (child.nodeName == gi
                    and i + 1 < len(children)
                    and children[i + 1].nodeName == gi):
                ewrite("--- merging two <%s/> elements\n" % gi)
                nextchild = children[i + 1]
                while nextchild.childNodes:
                    node = nextchild.childNodes[0]
                    nextchild.removeChild(node)
                    child.appendChild(node)
                parent.removeChild(nextchild)
                # Stay on this child: its new neighbour may match as well.
                continue
            if child.nodeType == ELEMENT:
                queue.append(child)
            i = i + 1
921 i = i + 1
922
923
# Attribute values matching this pattern are written as TOKEN, all others
# as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS event lines, recursively.

    knownempty is called with a tag name and returns true for elements
    declared empty; those are announced with an "e" line.

    Raises ValueError if a declared-empty element has children, and
    RuntimeError for a node that is not an element, text or entity
    reference.
    """
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        elif kind == ENTITY_REFERENCE:
            ofp.write("&%s\n" % node.nodeName)
        elif kind == ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError(
                        "declared-empty node <%s> has children" % gi)
                ofp.write("e\n")
            for k, value in node.attributes.items():
                dtype = "CDATA"
                if _token_rx.match(value):
                    dtype = "TOKEN"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        else:
            raise RuntimeError("unsupported node type: %s" % kind)
951
952
def convert(ifp, ofp):
    """Read ESIS data from ifp, fix up the tree, and write ESIS to ofp.

    The input is parsed with esistools into a document fragment, the
    fix-up passes are applied in a fixed order, and the result is written
    with write_esis().  An IOError whose errno is EPIPE while writing is
    ignored; any other IOError propagates.
    """
    events = esistools.parse(ifp)
    toktype, doc = events.getEvent()
    fragment = doc.createDocumentFragment()
    events.expandNode(fragment)

    normalize(fragment)
    simplify(doc, fragment)
    handle_labels(doc, fragment)
    handle_appendix(doc, fragment)
    # NOTE(review): this pass and cleanup_root_text() below are handed doc,
    # not fragment, unlike the other passes -- confirm that is intended.
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(fragment, ["function", "method", "cfunction"])
    cleanup_synopses(doc, fragment)
    fixup_descriptors(doc, fragment)
    fixup_verbatims(fragment)
    normalize(fragment)
    fixup_paras(doc, fragment)
    fixup_sectionauthors(doc, fragment)
    fixup_table_structures(doc, fragment)
    fixup_rfc_references(doc, fragment)
    fixup_signatures(doc, fragment)
    add_node_ids(fragment)
    fixup_refmodindexes(fragment)
    fixup_bifuncindexes(fragment)
    # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and
    # LaTeX2HTML screwing with GNU-style long options (the '--' problem).
    join_adjacent_elements(fragment, "option")
    #
    # <author> and <rfc> are given text content by the passes above, so
    # they must not be reported as empty even if the parser declares them.
    empties = set(events.parser.get_empties())
    empties.discard("author")
    empties.discard("rfc")
    knownempty = empties.__contains__
    #
    try:
        write_esis(fragment, ofp, knownempty)
    except IOError as err:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other
        # errors should still be reported.  The errno attribute is tested
        # rather than unpacking the exception, which fails unless its
        # arguments are exactly an (errno, message) pair.
        if err.errno != errno.EPIPE:
            raise
1008
1009
def usage():
    """Write a one-line synopsis of the command line to stderr."""
    sys.stderr.write("usage: %s [infile [outfile]]\n" % sys.argv[0])


def main():
    """Command-line entry point: docfixer [infile [outfile]].

    Without arguments, stdin is converted to stdout.  With one argument,
    that file is converted to stdout.  With two, the output is collected
    in memory and written to the second file only after convert() has
    returned.  Any other argument count prints the usage line and exits
    with status 2.
    """
    nargs = len(sys.argv)
    if nargs not in (1, 2, 3):
        usage()
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if nargs >= 2:
        ifp = open(sys.argv[1])
    if nargs == 3:
        import StringIO
        ofp = StringIO.StringIO()
    try:
        convert(ifp, ofp)
    finally:
        # Close the input file if we opened one, even when convert() fails.
        if ifp is not sys.stdin:
            ifp.close()
    if nargs == 3:
        fp = open(sys.argv[2], "w")
        try:
            fp.write(ofp.getvalue())
        finally:
            fp.close()
        ofp.close()


if __name__ == "__main__":
    main()