| #! /usr/bin/env python | 
 |  | 
 | """Perform massive transformations on a document tree created from the LaTeX | 
 | of the Python documentation, and dump the ESIS data for the transformed tree. | 
 | """ | 
 |  | 
 |  | 
 | import errno | 
 | import esistools | 
 | import re | 
 | import sys | 
 | import xml.dom | 
 | import xml.dom.minidom | 
 |  | 
# Short aliases for the DOM node-type constants that the functions in this
# file compare against node.nodeType.
ELEMENT = xml.dom.Node.ELEMENT_NODE
ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE
TEXT = xml.dom.Node.TEXT_NODE
 |  | 
 |  | 
 | class ConversionError(Exception): | 
 |     pass | 
 |  | 
 |  | 
 | ewrite = sys.stderr.write | 
 | try: | 
 |     # We can only do this trick on Unix (if tput is on $PATH)! | 
 |     if sys.platform != "posix" or not sys.stderr.isatty(): | 
 |         raise ImportError | 
 |     import commands | 
 | except ImportError: | 
 |     bwrite = ewrite | 
 | else: | 
 |     def bwrite(s, BOLDON=commands.getoutput("tput bold"), | 
 |                BOLDOFF=commands.getoutput("tput sgr0")): | 
 |         ewrite("%s%s%s" % (BOLDON, s, BOLDOFF)) | 
 |  | 
 |  | 
 | PARA_ELEMENT = "para" | 
 |  | 
 | DEBUG_PARA_FIXER = 0 | 
 |  | 
 | if DEBUG_PARA_FIXER: | 
 |     def para_msg(s): | 
 |         ewrite("*** %s\n" % s) | 
 | else: | 
 |     def para_msg(s): | 
 |         pass | 
 |  | 
 |  | 
 | def get_first_element(doc, gi): | 
 |     for n in doc.childNodes: | 
 |         if n.nodeName == gi: | 
 |             return n | 
 |  | 
 | def extract_first_element(doc, gi): | 
 |     node = get_first_element(doc, gi) | 
 |     if node is not None: | 
 |         doc.removeChild(node) | 
 |     return node | 
 |  | 
 |  | 
 | def get_documentElement(node): | 
 |     result = None | 
 |     for child in node.childNodes: | 
 |         if child.nodeType == ELEMENT: | 
 |             result = child | 
 |     return result | 
 |  | 
 |  | 
 | def set_tagName(elem, gi): | 
 |     elem.nodeName = elem.tagName = gi | 
 |  | 
 |  | 
 | def find_all_elements(doc, gi): | 
 |     nodes = [] | 
 |     if doc.nodeName == gi: | 
 |         nodes.append(doc) | 
 |     for child in doc.childNodes: | 
 |         if child.nodeType == ELEMENT: | 
 |             if child.tagName == gi: | 
 |                 nodes.append(child) | 
 |             for node in child.getElementsByTagName(gi): | 
 |                 nodes.append(node) | 
 |     return nodes | 
 |  | 
 | def find_all_child_elements(doc, gi): | 
 |     nodes = [] | 
 |     for child in doc.childNodes: | 
 |         if child.nodeName == gi: | 
 |             nodes.append(child) | 
 |     return nodes | 
 |  | 
 |  | 
 | def find_all_elements_from_set(doc, gi_set): | 
 |     return __find_all_elements_from_set(doc, gi_set, []) | 
 |  | 
 | def __find_all_elements_from_set(doc, gi_set, nodes): | 
 |     if doc.nodeName in gi_set: | 
 |         nodes.append(doc) | 
 |     for child in doc.childNodes: | 
 |         if child.nodeType == ELEMENT: | 
 |             __find_all_elements_from_set(child, gi_set, nodes) | 
 |     return nodes | 
 |  | 
 |  | 
 | def simplify(doc, fragment): | 
 |     # Try to rationalize the document a bit, since these things are simply | 
 |     # not valid SGML/XML documents as they stand, and need a little work. | 
 |     documentclass = "document" | 
 |     inputs = [] | 
 |     node = extract_first_element(fragment, "documentclass") | 
 |     if node is not None: | 
 |         documentclass = node.getAttribute("classname") | 
 |     node = extract_first_element(fragment, "title") | 
 |     if node is not None: | 
 |         inputs.append(node) | 
 |     # update the name of the root element | 
 |     node = get_first_element(fragment, "document") | 
 |     if node is not None: | 
 |         set_tagName(node, documentclass) | 
 |         # Move everything that comes before this node into this node; | 
 |         # this will be the document element. | 
 |         nodelist = fragment.childNodes | 
 |         point = node.firstChild | 
 |         while not nodelist[0].isSameNode(node): | 
 |             node.insertBefore(nodelist[0], point) | 
 |     while 1: | 
 |         node = extract_first_element(fragment, "input") | 
 |         if node is None: | 
 |             break | 
 |         inputs.append(node) | 
 |     if inputs: | 
 |         docelem = get_documentElement(fragment) | 
 |         inputs.reverse() | 
 |         for node in inputs: | 
 |             text = doc.createTextNode("\n") | 
 |             docelem.insertBefore(text, docelem.firstChild) | 
 |             docelem.insertBefore(node, text) | 
 |         docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild) | 
 |     while fragment.firstChild and fragment.firstChild.nodeType == TEXT: | 
 |         fragment.removeChild(fragment.firstChild) | 
 |  | 
 |  | 
 | def cleanup_root_text(doc): | 
 |     discards = [] | 
 |     skip = 0 | 
 |     for n in doc.childNodes: | 
 |         prevskip = skip | 
 |         skip = 0 | 
 |         if n.nodeType == TEXT and not prevskip: | 
 |             discards.append(n) | 
 |         elif n.nodeName == "COMMENT": | 
 |             skip = 1 | 
 |     for node in discards: | 
 |         doc.removeChild(node) | 
 |  | 
 |  | 
# Tag names that find_and_fix_descriptors() passes to rewrite_descriptor().
DESCRIPTOR_ELEMENTS = (
    "cfuncdesc", "cvardesc", "ctypedesc",
    "classdesc", "memberdesc", "memberdescni", "methoddesc", "methoddescni",
    "excdesc", "funcdesc", "funcdescni", "opcodedesc",
    "datadesc", "datadescni",
    )
 |  | 
 | def fixup_descriptors(doc, fragment): | 
 |     sections = find_all_elements(fragment, "section") | 
 |     for section in sections: | 
 |         find_and_fix_descriptors(doc, section) | 
 |  | 
 |  | 
 | def find_and_fix_descriptors(doc, container): | 
 |     children = container.childNodes | 
 |     for child in children: | 
 |         if child.nodeType == ELEMENT: | 
 |             tagName = child.tagName | 
 |             if tagName in DESCRIPTOR_ELEMENTS: | 
 |                 rewrite_descriptor(doc, child) | 
 |             elif tagName == "subsection": | 
 |                 find_and_fix_descriptors(doc, child) | 
 |  | 
 |  | 
 | def rewrite_descriptor(doc, descriptor): | 
 |     # | 
 |     # Do these things: | 
 |     #   1. Add an "index='no'" attribute to the element if the tagName | 
 |     #      ends in 'ni', removing the 'ni' from the name. | 
 |     #   2. Create a <signature> from the name attribute | 
 |     #   2a.Create an <args> if it appears to be available. | 
 |     #   3. Create additional <signature>s from <*line{,ni}> elements, | 
 |     #      if found. | 
 |     #   4. If a <versionadded> is found, move it to an attribute on the | 
 |     #      descriptor. | 
 |     #   5. Move remaining child nodes to a <description> element. | 
 |     #   6. Put it back together. | 
 |     # | 
 |     # 1. | 
 |     descname = descriptor.tagName | 
 |     index = descriptor.getAttribute("name") != "no" | 
 |     desctype = descname[:-4] # remove 'desc' | 
 |     linename = desctype + "line" | 
 |     if not index: | 
 |         linename = linename + "ni" | 
 |     # 2. | 
 |     signature = doc.createElement("signature") | 
 |     name = doc.createElement("name") | 
 |     signature.appendChild(doc.createTextNode("\n    ")) | 
 |     signature.appendChild(name) | 
 |     name.appendChild(doc.createTextNode(descriptor.getAttribute("name"))) | 
 |     descriptor.removeAttribute("name") | 
 |     # 2a. | 
 |     if descriptor.hasAttribute("var"): | 
 |         if descname != "opcodedesc": | 
 |             raise RuntimeError, \ | 
 |                   "got 'var' attribute on descriptor other than opcodedesc" | 
 |         variable = descriptor.getAttribute("var") | 
 |         if variable: | 
 |             args = doc.createElement("args") | 
 |             args.appendChild(doc.createTextNode(variable)) | 
 |             signature.appendChild(doc.createTextNode("\n    ")) | 
 |             signature.appendChild(args) | 
 |         descriptor.removeAttribute("var") | 
 |     newchildren = [signature] | 
 |     children = descriptor.childNodes | 
 |     pos = skip_leading_nodes(children) | 
 |     if pos < len(children): | 
 |         child = children[pos] | 
 |         if child.nodeName == "args": | 
 |             # move <args> to <signature>, or remove if empty: | 
 |             child.parentNode.removeChild(child) | 
 |             if len(child.childNodes): | 
 |                 signature.appendChild(doc.createTextNode("\n    ")) | 
 |                 signature.appendChild(child) | 
 |     signature.appendChild(doc.createTextNode("\n  ")) | 
 |     # 3, 4. | 
 |     pos = skip_leading_nodes(children, pos) | 
 |     while pos < len(children) \ | 
 |           and children[pos].nodeName in (linename, "versionadded"): | 
 |         if children[pos].tagName == linename: | 
 |             # this is really a supplemental signature, create <signature> | 
 |             oldchild = children[pos].cloneNode(1) | 
 |             try: | 
 |                 sig = methodline_to_signature(doc, children[pos]) | 
 |             except KeyError: | 
 |                 print oldchild.toxml() | 
 |                 raise | 
 |             newchildren.append(sig) | 
 |         else: | 
 |             # <versionadded added=...> | 
 |             descriptor.setAttribute( | 
 |                 "added", children[pos].getAttribute("version")) | 
 |         pos = skip_leading_nodes(children, pos + 1) | 
 |     # 5. | 
 |     description = doc.createElement("description") | 
 |     description.appendChild(doc.createTextNode("\n")) | 
 |     newchildren.append(description) | 
 |     move_children(descriptor, description, pos) | 
 |     last = description.childNodes[-1] | 
 |     if last.nodeType == TEXT: | 
 |         last.data = last.data.rstrip() + "\n  " | 
 |     # 6. | 
 |     # should have nothing but whitespace and signature lines in <descriptor>; | 
 |     # discard them | 
 |     while descriptor.childNodes: | 
 |         descriptor.removeChild(descriptor.childNodes[0]) | 
 |     for node in newchildren: | 
 |         descriptor.appendChild(doc.createTextNode("\n  ")) | 
 |         descriptor.appendChild(node) | 
 |     descriptor.appendChild(doc.createTextNode("\n")) | 
 |  | 
 |  | 
 | def methodline_to_signature(doc, methodline): | 
 |     signature = doc.createElement("signature") | 
 |     signature.appendChild(doc.createTextNode("\n    ")) | 
 |     name = doc.createElement("name") | 
 |     name.appendChild(doc.createTextNode(methodline.getAttribute("name"))) | 
 |     methodline.removeAttribute("name") | 
 |     signature.appendChild(name) | 
 |     if len(methodline.childNodes): | 
 |         args = doc.createElement("args") | 
 |         signature.appendChild(doc.createTextNode("\n    ")) | 
 |         signature.appendChild(args) | 
 |         move_children(methodline, args) | 
 |     signature.appendChild(doc.createTextNode("\n  ")) | 
 |     return signature | 
 |  | 
 |  | 
 | def move_children(origin, dest, start=0): | 
 |     children = origin.childNodes | 
 |     while start < len(children): | 
 |         node = children[start] | 
 |         origin.removeChild(node) | 
 |         dest.appendChild(node) | 
 |  | 
 |  | 
 | def handle_appendix(doc, fragment): | 
 |     # must be called after simplfy() if document is multi-rooted to begin with | 
 |     docelem = get_documentElement(fragment) | 
 |     toplevel = docelem.tagName == "manual" and "chapter" or "section" | 
 |     appendices = 0 | 
 |     nodes = [] | 
 |     for node in docelem.childNodes: | 
 |         if appendices: | 
 |             nodes.append(node) | 
 |         elif node.nodeType == ELEMENT: | 
 |             appnodes = node.getElementsByTagName("appendix") | 
 |             if appnodes: | 
 |                 appendices = 1 | 
 |                 parent = appnodes[0].parentNode | 
 |                 parent.removeChild(appnodes[0]) | 
 |                 parent.normalize() | 
 |     if nodes: | 
 |         map(docelem.removeChild, nodes) | 
 |         docelem.appendChild(doc.createTextNode("\n\n\n")) | 
 |         back = doc.createElement("back-matter") | 
 |         docelem.appendChild(back) | 
 |         back.appendChild(doc.createTextNode("\n")) | 
 |         while nodes and nodes[0].nodeType == TEXT \ | 
 |               and not nodes[0].data.strip(): | 
 |             del nodes[0] | 
 |         map(back.appendChild, nodes) | 
 |         docelem.appendChild(doc.createTextNode("\n")) | 
 |  | 
 |  | 
 | def handle_labels(doc, fragment): | 
 |     for label in find_all_elements(fragment, "label"): | 
 |         id = label.getAttribute("id") | 
 |         if not id: | 
 |             continue | 
 |         parent = label.parentNode | 
 |         parentTagName = parent.tagName | 
 |         if parentTagName == "title": | 
 |             parent.parentNode.setAttribute("id", id) | 
 |         else: | 
 |             parent.setAttribute("id", id) | 
 |         # now, remove <label id="..."/> from parent: | 
 |         parent.removeChild(label) | 
 |         if parentTagName == "title": | 
 |             parent.normalize() | 
 |             children = parent.childNodes | 
 |             if children[-1].nodeType == TEXT: | 
 |                 children[-1].data = children[-1].data.rstrip() | 
 |  | 
 |  | 
 | def fixup_trailing_whitespace(doc, fragment, wsmap): | 
 |     queue = [fragment] | 
 |     fixups = [] | 
 |     while queue: | 
 |         node = queue[0] | 
 |         del queue[0] | 
 |         if wsmap.has_key(node.nodeName): | 
 |             fixups.append(node) | 
 |         for child in node.childNodes: | 
 |             if child.nodeType == ELEMENT: | 
 |                 queue.append(child) | 
 |  | 
 |     # reverse the list to process from the inside out | 
 |     fixups.reverse() | 
 |     for node in fixups: | 
 |         node.parentNode.normalize() | 
 |         lastchild = node.lastChild | 
 |         before, after = wsmap[node.tagName] | 
 |         if lastchild.nodeType == TEXT: | 
 |             data = lastchild.data.rstrip() + before | 
 |             lastchild.data = data | 
 |         norm = 0 | 
 |         if wsmap[node.tagName]: | 
 |             nextnode = node.nextSibling | 
 |             if nextnode and nextnode.nodeType == TEXT: | 
 |                 nextnode.data = after + nextnode.data.lstrip() | 
 |             else: | 
 |                 wsnode = doc.createTextNode(after) | 
 |                 node.parentNode.insertBefore(wsnode, nextnode) | 
 |         # hack to get the title in place: | 
 |         if node.tagName == "title" \ | 
 |            and node.parentNode.firstChild.nodeType == ELEMENT: | 
 |             node.parentNode.insertBefore(doc.createTextNode("\n  "), | 
 |                                          node.parentNode.firstChild) | 
 |             node.parentNode.normalize() | 
 |  | 
 |  | 
 | def normalize(doc): | 
 |     for node in doc.childNodes: | 
 |         if node.nodeType == ELEMENT: | 
 |             node.normalize() | 
 |  | 
 |  | 
 | def cleanup_trailing_parens(doc, element_names): | 
 |     d = {} | 
 |     for gi in element_names: | 
 |         d[gi] = gi | 
 |     rewrite_element = d.has_key | 
 |     queue = [node for node in doc.childNodes if node.nodeType == ELEMENT] | 
 |     while queue: | 
 |         node = queue[0] | 
 |         del queue[0] | 
 |         if rewrite_element(node.tagName): | 
 |             lastchild = node.lastChild | 
 |             if lastchild and lastchild.nodeType == TEXT: | 
 |                 data = lastchild.data | 
 |                 if data.endswith("()"): | 
 |                     lastchild.data = data[:-2] | 
 |         else: | 
 |             for child in node.childNodes: | 
 |                 if child.nodeType == ELEMENT: | 
 |                     queue.append(child) | 
 |  | 
 |  | 
 | def contents_match(left, right): | 
 |     left_children = left.childNodes | 
 |     right_children = right.childNodes | 
 |     if len(left_children) != len(right_children): | 
 |         return 0 | 
 |     for l, r in map(None, left_children, right_children): | 
 |         nodeType = l.nodeType | 
 |         if nodeType != r.nodeType: | 
 |             return 0 | 
 |         if nodeType == ELEMENT: | 
 |             if l.tagName != r.tagName: | 
 |                 return 0 | 
 |             # should check attributes, but that's not a problem here | 
 |             if not contents_match(l, r): | 
 |                 return 0 | 
 |         elif nodeType == TEXT: | 
 |             if l.data != r.data: | 
 |                 return 0 | 
 |         else: | 
 |             # not quite right, but good enough | 
 |             return 0 | 
 |     return 1 | 
 |  | 
 |  | 
def create_module_info(doc, section):
    """Gather a section's module metadata into a <moduleinfo> element.

    Does nothing when section has no <modulesynopsis> child.  Otherwise the
    <modulesynopsis>, <moduleauthor> and <platform> children are removed
    from section.  When section's tag name is "section", a <moduleinfo> is
    built from them, together with the <declaremodule> name and type, the
    <versionadded> version and (when the title names the module) the title
    as a <short-synopsis>, and is inserted into section.  For any other tag
    name the removed children are not put back.

    doc is used to create the new nodes.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    set_tagName(node, "synopsis")
    # Drop a trailing period from the synopsis text.
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    modauthor = extract_first_element(section, "moduleauthor")
    if modauthor:
        # <moduleauthor name="X"/> becomes <author>X</author>.
        set_tagName(modauthor, "author")
        modauthor.appendChild(doc.createTextNode(
            modauthor.getAttribute("name")))
        modauthor.removeAttribute("name")
    platform = extract_first_element(section, "platform")
    if section.tagName == "section":
        # Child index at which <moduleinfo> is inserted; reset to 0 below
        # when the <title> is taken out of the section.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n    "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n    "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        versionadded = extract_first_element(section, "versionadded")
        if versionadded:
            modinfo.setAttribute("added", versionadded.getAttribute("version"))
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            # The title must start with a <module> element naming the
            # declared module.
            if len(children) >= 2 \
               and children[0].nodeName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = first_data.data[4:].lstrip()
                set_tagName(title, "short-synopsis")
                if children[-1].nodeType == TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                # Also removes whatever is now the section's first child.
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                ewrite("module name in title doesn't match"
                       " <declaremodule/>; no <short-synopsis/>\n")
        else:
            ewrite("Unexpected condition: <section/> without <title/>\n")
        modinfo.appendChild(doc.createTextNode("\n    "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(title)
        if modauthor:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(modauthor)
        if platform:
            modinfo.appendChild(doc.createTextNode("\n    "))
            modinfo.appendChild(platform)
        modinfo.appendChild(doc.createTextNode("\n  "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n  "), modinfo)
        #
        # The rest of this removes extra newlines from where we cut out
        # a lot of elements.  A lot of code for minimal value, but
        # keeps the generated *ML from being too funny looking.
        #
        section.normalize()
        children = section.childNodes
        for i in range(len(children)):
            node = children[i]
            if node.nodeName == "moduleinfo":
                nextnode = children[i+1]
                if nextnode.nodeType == TEXT:
                    data = nextnode.data
                    s = data.lstrip()
                    # Only rewrite when more than four characters of leading
                    # whitespace were found.
                    if len(s) < (len(data) - 4):
                        nextnode.data = "\n\n\n" + s
 |  | 
 |  | 
 | def cleanup_synopses(doc, fragment): | 
 |     for node in find_all_elements(fragment, "section"): | 
 |         create_module_info(doc, node) | 
 |  | 
 |  | 
 | def fixup_table_structures(doc, fragment): | 
 |     for table in find_all_elements(fragment, "table"): | 
 |         fixup_table(doc, table) | 
 |  | 
 |  | 
 | def fixup_table(doc, table): | 
 |     # create the table head | 
 |     thead = doc.createElement("thead") | 
 |     row = doc.createElement("row") | 
 |     move_elements_by_name(doc, table, row, "entry") | 
 |     thead.appendChild(doc.createTextNode("\n    ")) | 
 |     thead.appendChild(row) | 
 |     thead.appendChild(doc.createTextNode("\n    ")) | 
 |     # create the table body | 
 |     tbody = doc.createElement("tbody") | 
 |     prev_row = None | 
 |     last_was_hline = 0 | 
 |     children = table.childNodes | 
 |     for child in children: | 
 |         if child.nodeType == ELEMENT: | 
 |             tagName = child.tagName | 
 |             if tagName == "hline" and prev_row is not None: | 
 |                 prev_row.setAttribute("rowsep", "1") | 
 |             elif tagName == "row": | 
 |                 prev_row = child | 
 |     # save the rows: | 
 |     tbody.appendChild(doc.createTextNode("\n    ")) | 
 |     move_elements_by_name(doc, table, tbody, "row", sep="\n    ") | 
 |     # and toss the rest: | 
 |     while children: | 
 |         child = children[0] | 
 |         nodeType = child.nodeType | 
 |         if nodeType == TEXT: | 
 |             if child.data.strip(): | 
 |                 raise ConversionError("unexpected free data in <%s>: %r" | 
 |                                       % (table.tagName, child.data)) | 
 |             table.removeChild(child) | 
 |             continue | 
 |         if nodeType == ELEMENT: | 
 |             if child.tagName != "hline": | 
 |                 raise ConversionError( | 
 |                     "unexpected <%s> in table" % child.tagName) | 
 |             table.removeChild(child) | 
 |             continue | 
 |         raise ConversionError( | 
 |             "unexpected %s node in table" % child.__class__.__name__) | 
 |     # nothing left in the <table>; add the <thead> and <tbody> | 
 |     tgroup = doc.createElement("tgroup") | 
 |     tgroup.appendChild(doc.createTextNode("\n  ")) | 
 |     tgroup.appendChild(thead) | 
 |     tgroup.appendChild(doc.createTextNode("\n  ")) | 
 |     tgroup.appendChild(tbody) | 
 |     tgroup.appendChild(doc.createTextNode("\n  ")) | 
 |     table.appendChild(tgroup) | 
 |     # now make the <entry>s look nice: | 
 |     for row in table.getElementsByTagName("row"): | 
 |         fixup_row(doc, row) | 
 |  | 
 |  | 
 | def fixup_row(doc, row): | 
 |     entries = [] | 
 |     map(entries.append, row.childNodes[1:]) | 
 |     for entry in entries: | 
 |         row.insertBefore(doc.createTextNode("\n         "), entry) | 
 | #    row.appendChild(doc.createTextNode("\n      ")) | 
 |  | 
 |  | 
 | def move_elements_by_name(doc, source, dest, name, sep=None): | 
 |     nodes = [] | 
 |     for child in source.childNodes: | 
 |         if child.nodeName == name: | 
 |             nodes.append(child) | 
 |     for node in nodes: | 
 |         source.removeChild(node) | 
 |         dest.appendChild(node) | 
 |         if sep: | 
 |             dest.appendChild(doc.createTextNode(sep)) | 
 |  | 
 |  | 
# Elements whose contents fixup_paras_helper() processes recursively
# instead of wrapping them in a paragraph; build_para() also ends a
# paragraph when it reaches one of them.
RECURSE_INTO_PARA_CONTAINERS = (
    "chapter", "abstract", "enumerate",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph", "back-matter",
    "howto", "manual",
    "item", "itemize", "fulllineitems", "enumeration", "descriptionlist",
    "definitionlist", "definition",
    )
 |  | 
 | PARA_LEVEL_ELEMENTS = ( | 
 |     "moduleinfo", "title", "verbatim", "enumerate", "item", | 
 |     "interpreter-session", "back-matter", "interactive-session", | 
 |     "opcodedesc", "classdesc", "datadesc", | 
 |     "cfuncdesc", "ctypedesc", "cvardesc", | 
 |     "funcdesc", "methoddesc", "excdesc", "memberdesc", "membderdescni", | 
 |     "funcdescni", "methoddescni", "excdescni", | 
 |     "tableii", "tableiii", "tableiv", "localmoduletable", | 
 |     "sectionauthor", "seealso", "itemize", | 
 |     # include <para>, so we can just do it again to get subsequent paras: | 
 |     PARA_ELEMENT, | 
 |     ) | 
 |  | 
# Elements that skip_leading_nodes() steps over when looking for the place
# where paragraph building should begin.
PARA_LEVEL_PRECEEDERS = (
    "setindexsubitem", "author",
    "stindex", "obindex", "COMMENT", "label", "xi:include", "title",
    "versionadded", "versionchanged", "declaremodule", "modulesynopsis",
    "moduleauthor", "indexterm", "leader",
    )
 |  | 
 |  | 
 | def fixup_paras(doc, fragment): | 
 |     for child in fragment.childNodes: | 
 |         if child.nodeName in RECURSE_INTO_PARA_CONTAINERS: | 
 |             fixup_paras_helper(doc, child) | 
 |     descriptions = find_all_elements(fragment, "description") | 
 |     for description in descriptions: | 
 |         fixup_paras_helper(doc, description) | 
 |  | 
 |  | 
 | def fixup_paras_helper(doc, container, depth=0): | 
 |     # document is already normalized | 
 |     children = container.childNodes | 
 |     start = skip_leading_nodes(children) | 
 |     while len(children) > start: | 
 |         if children[start].nodeName in RECURSE_INTO_PARA_CONTAINERS: | 
 |             # Something to recurse into: | 
 |             fixup_paras_helper(doc, children[start]) | 
 |         else: | 
 |             # Paragraph material: | 
 |             build_para(doc, container, start, len(children)) | 
 |             if DEBUG_PARA_FIXER and depth == 10: | 
 |                 sys.exit(1) | 
 |         start = skip_leading_nodes(children, start + 1) | 
 |  | 
 |  | 
 | def build_para(doc, parent, start, i): | 
 |     children = parent.childNodes | 
 |     after = start + 1 | 
 |     have_last = 0 | 
 |     BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS | 
 |     # Collect all children until \n\n+ is found in a text node or a | 
 |     # member of BREAK_ELEMENTS is found. | 
 |     for j in range(start, i): | 
 |         after = j + 1 | 
 |         child = children[j] | 
 |         nodeType = child.nodeType | 
 |         if nodeType == ELEMENT: | 
 |             if child.tagName in BREAK_ELEMENTS: | 
 |                 after = j | 
 |                 break | 
 |         elif nodeType == TEXT: | 
 |             pos = child.data.find("\n\n") | 
 |             if pos == 0: | 
 |                 after = j | 
 |                 break | 
 |             if pos >= 1: | 
 |                 child.splitText(pos) | 
 |                 break | 
 |     else: | 
 |         have_last = 1 | 
 |     if (start + 1) > after: | 
 |         raise ConversionError( | 
 |             "build_para() could not identify content to turn into a paragraph") | 
 |     if children[after - 1].nodeType == TEXT: | 
 |         # we may need to split off trailing white space: | 
 |         child = children[after - 1] | 
 |         data = child.data | 
 |         if data.rstrip() != data: | 
 |             have_last = 0 | 
 |             child.splitText(len(data.rstrip())) | 
 |     para = doc.createElement(PARA_ELEMENT) | 
 |     prev = None | 
 |     indexes = range(start, after) | 
 |     indexes.reverse() | 
 |     for j in indexes: | 
 |         node = parent.childNodes[j] | 
 |         parent.removeChild(node) | 
 |         para.insertBefore(node, prev) | 
 |         prev = node | 
 |     if have_last: | 
 |         parent.appendChild(para) | 
 |         parent.appendChild(doc.createTextNode("\n\n")) | 
 |         return len(parent.childNodes) | 
 |     else: | 
 |         nextnode = parent.childNodes[start] | 
 |         if nextnode.nodeType == TEXT: | 
 |             if nextnode.data and nextnode.data[0] != "\n": | 
 |                 nextnode.data = "\n" + nextnode.data | 
 |         else: | 
 |             newnode = doc.createTextNode("\n") | 
 |             parent.insertBefore(newnode, nextnode) | 
 |             nextnode = newnode | 
 |             start = start + 1 | 
 |         parent.insertBefore(para, nextnode) | 
 |         return start + 1 | 
 |  | 
 |  | 
 | def skip_leading_nodes(children, start=0): | 
 |     """Return index into children of a node at which paragraph building should | 
 |     begin or a recursive call to fixup_paras_helper() should be made (for | 
 |     subsections, etc.). | 
 |  | 
 |     When the return value >= len(children), we've built all the paras we can | 
 |     from this list of children. | 
 |     """ | 
 |     i = len(children) | 
 |     while i > start: | 
 |         # skip over leading comments and whitespace: | 
 |         child = children[start] | 
 |         nodeType = child.nodeType | 
 |         if nodeType == TEXT: | 
 |             data = child.data | 
 |             shortened = data.lstrip() | 
 |             if shortened: | 
 |                 if data != shortened: | 
 |                     # break into two nodes: whitespace and non-whitespace | 
 |                     child.splitText(len(data) - len(shortened)) | 
 |                     return start + 1 | 
 |                 return start | 
 |             # all whitespace, just skip | 
 |         elif nodeType == ELEMENT: | 
 |             tagName = child.tagName | 
 |             if tagName in RECURSE_INTO_PARA_CONTAINERS: | 
 |                 return start | 
 |             if tagName not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS: | 
 |                 return start | 
 |         start = start + 1 | 
 |     return start | 
 |  | 
 |  | 
 | def fixup_rfc_references(doc, fragment): | 
 |     for rfcnode in find_all_elements_from_set(fragment, ("pep", "rfc")): | 
 |         rfcnode.appendChild(doc.createTextNode( | 
 |             rfcnode.tagName.upper() + " " + rfcnode.getAttribute("num"))) | 
 |  | 
 |  | 
 | def fixup_signatures(doc, fragment): | 
 |     for child in fragment.childNodes: | 
 |         if child.nodeType == ELEMENT: | 
 |             args = child.getElementsByTagName("args") | 
 |             for arg in args: | 
 |                 rewrite_args(doc, arg) | 
 |             args = child.getElementsByTagName("constructor-args") | 
 |             for arg in args: | 
 |                 rewrite_args(doc, arg) | 
 |  | 
 | def rewrite_args(doc, arglist): | 
 |     fixup_args(doc, arglist) | 
 |     arglist.normalize() | 
 |     if arglist.childNodes.length == 1 and arglist.firstChild.nodeType == TEXT: | 
 |         node = arglist.firstChild | 
 |         node.data = ' '.join(node.data.split()) | 
 |  | 
 | def fixup_args(doc, arglist): | 
 |     for child in arglist.childNodes: | 
 |         if child.nodeName == "optional": | 
 |             # found it; fix and return | 
 |             arglist.insertBefore(doc.createTextNode("["), child) | 
 |             optkids = child.childNodes | 
 |             while optkids: | 
 |                 arglist.insertBefore(child.firstChild, child) | 
 |             arglist.insertBefore(doc.createTextNode("]"), child) | 
 |             arglist.removeChild(child) | 
 |             return fixup_args(doc, arglist) | 
 |  | 
 |  | 
 | def fixup_sectionauthors(doc, fragment): | 
 |     for sectauth in find_all_elements(fragment, "sectionauthor"): | 
 |         section = sectauth.parentNode | 
 |         section.removeChild(sectauth) | 
 |         set_tagName(sectauth, "author") | 
 |         sectauth.appendChild(doc.createTextNode( | 
 |             sectauth.getAttribute("name"))) | 
 |         sectauth.removeAttribute("name") | 
 |         after = section.childNodes[2] | 
 |         title = section.childNodes[1] | 
 |         if title.nodeName != "title": | 
 |             after = section.childNodes[0] | 
 |         section.insertBefore(doc.createTextNode("\n  "), after) | 
 |         section.insertBefore(sectauth, after) | 
 |  | 
 |  | 
 | def fixup_verbatims(doc): | 
 |     for verbatim in find_all_elements(doc, "verbatim"): | 
 |         child = verbatim.childNodes[0] | 
 |         if child.nodeType == TEXT \ | 
 |            and child.data.lstrip().startswith(">>>"): | 
 |             set_tagName(verbatim, "interactive-session") | 
 |  | 
 |  | 
 | def add_node_ids(fragment, counter=0): | 
 |     fragment.node_id = counter | 
 |     for node in fragment.childNodes: | 
 |         counter = counter + 1 | 
 |         if node.nodeType == ELEMENT: | 
 |             counter = add_node_ids(node, counter) | 
 |         else: | 
 |             node.node_id = counter | 
 |     return counter + 1 | 
 |  | 
 |  | 
 | def fixup_ulink(doc, fragment): | 
 |     for ulink in find_all_elements(fragment, "ulink"): | 
 |         children = ulink.childNodes | 
 |         assert len(children) == 2 | 
 |         text = children[0] | 
 |         href = children[1] | 
 |         href.normalize() | 
 |         assert len(href.childNodes) == 1 | 
 |         assert href.childNodes[0].nodeType == TEXT | 
 |         url = href.childNodes[0].data | 
 |         ulink.setAttribute("href", url) | 
 |         ulink.removeChild(href) | 
 |         content = text.childNodes | 
 |         while len(content): | 
 |             ulink.appendChild(content[0]) | 
 |         ulink.removeChild(text) | 
 |  | 
 |  | 
 | REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex', | 
 |                         'refexmodindex', 'refstmodindex') | 
 |  | 
 | def fixup_refmodindexes(fragment): | 
 |     # Locate <ref*modindex>...</> co-located with <module>...</>, and | 
 |     # remove the <ref*modindex>, replacing it with index=index on the | 
 |     # <module> element. | 
 |     nodes = find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS) | 
 |     d = {} | 
 |     for node in nodes: | 
 |         parent = node.parentNode | 
 |         d[parent.node_id] = parent | 
 |     del nodes | 
 |     map(fixup_refmodindexes_chunk, d.values()) | 
 |  | 
 |  | 
 | def fixup_refmodindexes_chunk(container): | 
 |     # node is probably a <para>; let's see how often it isn't: | 
 |     if container.tagName != PARA_ELEMENT: | 
 |         bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container) | 
 |     module_entries = find_all_elements(container, "module") | 
 |     if not module_entries: | 
 |         return | 
 |     index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS) | 
 |     removes = [] | 
 |     for entry in index_entries: | 
 |         children = entry.childNodes | 
 |         if len(children) != 0: | 
 |             bwrite("--- unexpected number of children for %s node:\n" | 
 |                    % entry.tagName) | 
 |             ewrite(entry.toxml() + "\n") | 
 |             continue | 
 |         found = 0 | 
 |         module_name = entry.getAttribute("module") | 
 |         for node in module_entries: | 
 |             if len(node.childNodes) != 1: | 
 |                 continue | 
 |             this_name = node.childNodes[0].data | 
 |             if this_name == module_name: | 
 |                 found = 1 | 
 |                 node.setAttribute("index", "yes") | 
 |         if found: | 
 |             removes.append(entry) | 
 |     for node in removes: | 
 |         container.removeChild(node) | 
 |  | 
 |  | 
 | def fixup_bifuncindexes(fragment): | 
 |     nodes = find_all_elements(fragment, 'bifuncindex') | 
 |     d = {} | 
 |     # make sure that each parent is only processed once: | 
 |     for node in nodes: | 
 |         parent = node.parentNode | 
 |         d[parent.node_id] = parent | 
 |     del nodes | 
 |     map(fixup_bifuncindexes_chunk, d.values()) | 
 |  | 
 |  | 
 | def fixup_bifuncindexes_chunk(container): | 
 |     removes = [] | 
 |     entries = find_all_child_elements(container, "bifuncindex") | 
 |     function_entries = find_all_child_elements(container, "function") | 
 |     for entry in entries: | 
 |         function_name = entry.getAttribute("name") | 
 |         found = 0 | 
 |         for func_entry in function_entries: | 
 |             t2 = func_entry.childNodes[0].data | 
 |             if t2[-2:] != "()": | 
 |                 continue | 
 |             t2 = t2[:-2] | 
 |             if t2 == function_name: | 
 |                 func_entry.setAttribute("index", "yes") | 
 |                 func_entry.setAttribute("module", "__builtin__") | 
 |                 if not found: | 
 |                     found = 1 | 
 |                     removes.append(entry) | 
 |     for entry in removes: | 
 |         container.removeChild(entry) | 
 |  | 
 |  | 
 | def join_adjacent_elements(container, gi): | 
 |     queue = [container] | 
 |     while queue: | 
 |         parent = queue.pop() | 
 |         i = 0 | 
 |         children = parent.childNodes | 
 |         nchildren = len(children) | 
 |         while i < (nchildren - 1): | 
 |             child = children[i] | 
 |             if child.nodeName == gi: | 
 |                 if children[i+1].nodeName == gi: | 
 |                     ewrite("--- merging two <%s/> elements\n" % gi) | 
 |                     child = children[i] | 
 |                     nextchild = children[i+1] | 
 |                     nextchildren = nextchild.childNodes | 
 |                     while len(nextchildren): | 
 |                         node = nextchildren[0] | 
 |                         nextchild.removeChild(node) | 
 |                         child.appendChild(node) | 
 |                     parent.removeChild(nextchild) | 
 |                     continue | 
 |             if child.nodeType == ELEMENT: | 
 |                 queue.append(child) | 
 |             i = i + 1 | 
 |  | 
 |  | 
 | _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") | 
 |  | 
 | def write_esis(doc, ofp, knownempty): | 
 |     for node in doc.childNodes: | 
 |         nodeType = node.nodeType | 
 |         if nodeType == ELEMENT: | 
 |             gi = node.tagName | 
 |             if knownempty(gi): | 
 |                 if node.hasChildNodes(): | 
 |                     raise ValueError, \ | 
 |                           "declared-empty node <%s> has children" % gi | 
 |                 ofp.write("e\n") | 
 |             for k, value in node.attributes.items(): | 
 |                 if _token_rx.match(value): | 
 |                     dtype = "TOKEN" | 
 |                 else: | 
 |                     dtype = "CDATA" | 
 |                 ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value))) | 
 |             ofp.write("(%s\n" % gi) | 
 |             write_esis(node, ofp, knownempty) | 
 |             ofp.write(")%s\n" % gi) | 
 |         elif nodeType == TEXT: | 
 |             ofp.write("-%s\n" % esistools.encode(node.data)) | 
 |         elif nodeType == ENTITY_REFERENCE: | 
 |             ofp.write("&%s\n" % node.nodeName) | 
 |         else: | 
 |             raise RuntimeError, "unsupported node type: %s" % nodeType | 
 |  | 
 |  | 
 | def convert(ifp, ofp): | 
 |     events = esistools.parse(ifp) | 
 |     toktype, doc = events.getEvent() | 
 |     fragment = doc.createDocumentFragment() | 
 |     events.expandNode(fragment) | 
 |  | 
 |     normalize(fragment) | 
 |     simplify(doc, fragment) | 
 |     handle_labels(doc, fragment) | 
 |     handle_appendix(doc, fragment) | 
 |     fixup_trailing_whitespace(doc, fragment, { | 
 |         # element -> (before-end-tag, after-end-tag) | 
 |         "abstract": ("\n", "\n"), | 
 |         "title": ("", "\n"), | 
 |         "chapter": ("\n", "\n\n\n"), | 
 |         "section": ("\n", "\n\n\n"), | 
 |         "subsection": ("\n", "\n\n"), | 
 |         "subsubsection": ("\n", "\n\n"), | 
 |         "paragraph": ("\n", "\n\n"), | 
 |         "subparagraph": ("\n", "\n\n"), | 
 |         "description": ("\n", "\n\n"), | 
 |         "enumeration": ("\n", "\n\n"), | 
 |         "item": ("\n", "\n\n"), | 
 |         }) | 
 |     cleanup_root_text(doc) | 
 |     cleanup_trailing_parens(fragment, ["function", "method", "cfunction"]) | 
 |     cleanup_synopses(doc, fragment) | 
 |     fixup_descriptors(doc, fragment) | 
 |     fixup_verbatims(fragment) | 
 |     normalize(fragment) | 
 |     fixup_paras(doc, fragment) | 
 |     fixup_sectionauthors(doc, fragment) | 
 |     fixup_table_structures(doc, fragment) | 
 |     fixup_rfc_references(doc, fragment) | 
 |     fixup_signatures(doc, fragment) | 
 |     fixup_ulink(doc, fragment) | 
 |     add_node_ids(fragment) | 
 |     fixup_refmodindexes(fragment) | 
 |     fixup_bifuncindexes(fragment) | 
 |     # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and | 
 |     # LaTeX2HTML screwing with GNU-style long options (the '--' problem). | 
 |     join_adjacent_elements(fragment, "option") | 
 |     # Attempt to avoid trailing blank lines: | 
 |     fragment.normalize() | 
 |     if fragment.lastChild.data[-1:] == "\n": | 
 |         fragment.lastChild.data = fragment.lastChild.data.rstrip() + "\n" | 
 |     # | 
 |     d = {} | 
 |     for gi in events.parser.get_empties(): | 
 |         d[gi] = gi | 
 |     for key in ("author", "pep", "rfc"): | 
 |         if d.has_key(key): | 
 |             del d[key] | 
 |     knownempty = d.has_key | 
 |     # | 
 |     try: | 
 |         write_esis(fragment, ofp, knownempty) | 
 |     except IOError, (err, msg): | 
 |         # Ignore EPIPE; it just means that whoever we're writing to stopped | 
 |         # reading.  The rest of the output would be ignored.  All other errors | 
 |         # should still be reported, | 
 |         if err != errno.EPIPE: | 
 |             raise | 
 |  | 
 |  | 
 | def main(): | 
 |     if len(sys.argv) == 1: | 
 |         ifp = sys.stdin | 
 |         ofp = sys.stdout | 
 |     elif len(sys.argv) == 2: | 
 |         ifp = open(sys.argv[1]) | 
 |         ofp = sys.stdout | 
 |     elif len(sys.argv) == 3: | 
 |         ifp = open(sys.argv[1]) | 
 |         import StringIO | 
 |         ofp = StringIO.StringIO() | 
 |     else: | 
 |         usage() | 
 |         sys.exit(2) | 
 |     convert(ifp, ofp) | 
 |     if len(sys.argv) == 3: | 
 |         fp = open(sys.argv[2], "w") | 
 |         fp.write(ofp.getvalue()) | 
 |         fp.close() | 
 |         ofp.close() | 
 |  | 
 |  | 
 | if __name__ == "__main__": | 
 |     main() |