#! /usr/bin/env python3
| |
| """Perform massive transformations on a document tree created from the LaTeX |
| of the Python documentation, and dump the ESIS data for the transformed tree. |
| """ |
| |
| |
| import errno |
| import esistools |
import os
import re
| import sys |
| import xml.dom |
| import xml.dom.minidom |
| |
| ELEMENT = xml.dom.Node.ELEMENT_NODE |
| ENTITY_REFERENCE = xml.dom.Node.ENTITY_REFERENCE_NODE |
| TEXT = xml.dom.Node.TEXT_NODE |
| |
| |
| class ConversionError(Exception): |
| pass |
| |
| |
| ewrite = sys.stderr.write |
| try: |
| # We can only do this trick on Unix (if tput is on $PATH)! |
    if os.name != "posix" or not sys.stderr.isatty():
        raise ImportError
    from subprocess import getoutput
except ImportError:
    bwrite = ewrite
else:
    def bwrite(s, BOLDON=getoutput("tput bold"),
               BOLDOFF=getoutput("tput sgr0")):
        ewrite("%s%s%s" % (BOLDON, s, BOLDOFF))
| |
| |
| PARA_ELEMENT = "para" |
| |
| DEBUG_PARA_FIXER = 0 |
| |
| if DEBUG_PARA_FIXER: |
| def para_msg(s): |
| ewrite("*** %s\n" % s) |
| else: |
| def para_msg(s): |
| pass |
| |
| |
| def get_first_element(doc, gi): |
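    """Return the first child of doc whose nodeName is gi, or None."""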
| for n in doc.childNodes: |
| if n.nodeName == gi: |
| return n |
| |
| def extract_first_element(doc, gi): |
| node = get_first_element(doc, gi) |
| if node is not None: |
| doc.removeChild(node) |
| return node |
| |
| |
| def get_documentElement(node): |
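    """Return the last element child of node; once simplify() has run, this
    is normally the fragment's document element.
    """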
| result = None |
| for child in node.childNodes: |
| if child.nodeType == ELEMENT: |
| result = child |
| return result |
| |
| |
| def set_tagName(elem, gi): |
| elem.nodeName = elem.tagName = gi |
| |
| |
| def find_all_elements(doc, gi): |
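    """Return doc itself (if it is named gi) plus every descendant element
    named gi, in document order.
    """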
| nodes = [] |
| if doc.nodeName == gi: |
| nodes.append(doc) |
| for child in doc.childNodes: |
| if child.nodeType == ELEMENT: |
| if child.tagName == gi: |
| nodes.append(child) |
| for node in child.getElementsByTagName(gi): |
| nodes.append(node) |
| return nodes |
| |
| def find_all_child_elements(doc, gi): |
| nodes = [] |
| for child in doc.childNodes: |
| if child.nodeName == gi: |
| nodes.append(child) |
| return nodes |
| |
| |
| def find_all_elements_from_set(doc, gi_set): |
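    """Recursively collect doc and its descendant elements whose names are
    in gi_set.
    """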
| return __find_all_elements_from_set(doc, gi_set, []) |
| |
| def __find_all_elements_from_set(doc, gi_set, nodes): |
| if doc.nodeName in gi_set: |
| nodes.append(doc) |
| for child in doc.childNodes: |
| if child.nodeType == ELEMENT: |
| __find_all_elements_from_set(child, gi_set, nodes) |
| return nodes |
| |
| |
| def simplify(doc, fragment): |
| # Try to rationalize the document a bit, since these things are simply |
| # not valid SGML/XML documents as they stand, and need a little work. |
| documentclass = "document" |
| inputs = [] |
| node = extract_first_element(fragment, "documentclass") |
| if node is not None: |
| documentclass = node.getAttribute("classname") |
| node = extract_first_element(fragment, "title") |
| if node is not None: |
| inputs.append(node) |
| # update the name of the root element |
| node = get_first_element(fragment, "document") |
| if node is not None: |
| set_tagName(node, documentclass) |
| # Move everything that comes before this node into this node; |
| # this will be the document element. |
| nodelist = fragment.childNodes |
| point = node.firstChild |
| while not nodelist[0].isSameNode(node): |
| node.insertBefore(nodelist[0], point) |
| while 1: |
| node = extract_first_element(fragment, "input") |
| if node is None: |
| break |
| inputs.append(node) |
| if inputs: |
| docelem = get_documentElement(fragment) |
| inputs.reverse() |
| for node in inputs: |
| text = doc.createTextNode("\n") |
| docelem.insertBefore(text, docelem.firstChild) |
| docelem.insertBefore(node, text) |
| docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild) |
| while fragment.firstChild and fragment.firstChild.nodeType == TEXT: |
| fragment.removeChild(fragment.firstChild) |
| |
| |
| def cleanup_root_text(doc): |
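    """Discard text nodes at the document root, keeping any text node that
    immediately follows a comment.
    """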
| discards = [] |
| skip = 0 |
| for n in doc.childNodes: |
| prevskip = skip |
| skip = 0 |
| if n.nodeType == TEXT and not prevskip: |
| discards.append(n) |
| elif n.nodeName == "COMMENT": |
| skip = 1 |
| for node in discards: |
| doc.removeChild(node) |
| |
| |
| DESCRIPTOR_ELEMENTS = ( |
| "cfuncdesc", "cvardesc", "ctypedesc", |
| "classdesc", "memberdesc", "memberdescni", "methoddesc", "methoddescni", |
| "excdesc", "funcdesc", "funcdescni", "opcodedesc", |
| "datadesc", "datadescni", |
| ) |
| |
| def fixup_descriptors(doc, fragment): |
| sections = find_all_elements(fragment, "section") |
| for section in sections: |
| find_and_fix_descriptors(doc, section) |
| |
| |
| def find_and_fix_descriptors(doc, container): |
| children = container.childNodes |
| for child in children: |
| if child.nodeType == ELEMENT: |
| tagName = child.tagName |
| if tagName in DESCRIPTOR_ELEMENTS: |
| rewrite_descriptor(doc, child) |
| elif tagName == "subsection": |
| find_and_fix_descriptors(doc, child) |
| |
| |
| def rewrite_descriptor(doc, descriptor): |
| # |
| # Do these things: |
| # 1. Add an "index='no'" attribute to the element if the tagName |
| # ends in 'ni', removing the 'ni' from the name. |
| # 2. Create a <signature> from the name attribute |
    # 2a. Create an <args> if it appears to be available.
| # 3. Create additional <signature>s from <*line{,ni}> elements, |
| # if found. |
| # 4. If a <versionadded> is found, move it to an attribute on the |
| # descriptor. |
| # 5. Move remaining child nodes to a <description> element. |
| # 6. Put it back together. |
| # |
| # 1. |
    descname = descriptor.tagName
    index = 1
    if descname[-2:] == "ni":
        descname = descname[:-2]
        descriptor.setAttribute("index", "no")
        set_tagName(descriptor, descname)
        index = 0
    desctype = descname[:-4]            # remove 'desc'
    linename = desctype + "line"
    if not index:
        linename = linename + "ni"
| # 2. |
| signature = doc.createElement("signature") |
| name = doc.createElement("name") |
| signature.appendChild(doc.createTextNode("\n ")) |
| signature.appendChild(name) |
| name.appendChild(doc.createTextNode(descriptor.getAttribute("name"))) |
| descriptor.removeAttribute("name") |
| # 2a. |
| if descriptor.hasAttribute("var"): |
| if descname != "opcodedesc": |
| raise RuntimeError("got 'var' attribute on descriptor other than opcodedesc") |
| variable = descriptor.getAttribute("var") |
| if variable: |
| args = doc.createElement("args") |
| args.appendChild(doc.createTextNode(variable)) |
| signature.appendChild(doc.createTextNode("\n ")) |
| signature.appendChild(args) |
| descriptor.removeAttribute("var") |
| newchildren = [signature] |
| children = descriptor.childNodes |
| pos = skip_leading_nodes(children) |
| if pos < len(children): |
| child = children[pos] |
| if child.nodeName == "args": |
| # move <args> to <signature>, or remove if empty: |
| child.parentNode.removeChild(child) |
| if len(child.childNodes): |
| signature.appendChild(doc.createTextNode("\n ")) |
| signature.appendChild(child) |
| signature.appendChild(doc.createTextNode("\n ")) |
| # 3, 4. |
| pos = skip_leading_nodes(children, pos) |
| while pos < len(children) \ |
| and children[pos].nodeName in (linename, "versionadded"): |
| if children[pos].tagName == linename: |
| # this is really a supplemental signature, create <signature> |
| oldchild = children[pos].cloneNode(1) |
| try: |
| sig = methodline_to_signature(doc, children[pos]) |
| except KeyError: |
| print(oldchild.toxml()) |
| raise |
| newchildren.append(sig) |
| else: |
| # <versionadded added=...> |
| descriptor.setAttribute( |
| "added", children[pos].getAttribute("version")) |
| pos = skip_leading_nodes(children, pos + 1) |
| # 5. |
| description = doc.createElement("description") |
| description.appendChild(doc.createTextNode("\n")) |
| newchildren.append(description) |
| move_children(descriptor, description, pos) |
| last = description.childNodes[-1] |
| if last.nodeType == TEXT: |
| last.data = last.data.rstrip() + "\n " |
| # 6. |
| # should have nothing but whitespace and signature lines in <descriptor>; |
| # discard them |
| while descriptor.childNodes: |
| descriptor.removeChild(descriptor.childNodes[0]) |
| for node in newchildren: |
| descriptor.appendChild(doc.createTextNode("\n ")) |
| descriptor.appendChild(node) |
| descriptor.appendChild(doc.createTextNode("\n")) |
| |
| |
| def methodline_to_signature(doc, methodline): |
| signature = doc.createElement("signature") |
| signature.appendChild(doc.createTextNode("\n ")) |
| name = doc.createElement("name") |
| name.appendChild(doc.createTextNode(methodline.getAttribute("name"))) |
| methodline.removeAttribute("name") |
| signature.appendChild(name) |
| if len(methodline.childNodes): |
| args = doc.createElement("args") |
| signature.appendChild(doc.createTextNode("\n ")) |
| signature.appendChild(args) |
| move_children(methodline, args) |
| signature.appendChild(doc.createTextNode("\n ")) |
| return signature |
| |
| |
| def move_children(origin, dest, start=0): |
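    """Move origin's child nodes, starting at index start, to the end of
    dest.
    """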
| children = origin.childNodes |
| while start < len(children): |
| node = children[start] |
| origin.removeChild(node) |
| dest.appendChild(node) |
| |
| |
| def handle_appendix(doc, fragment): |
    # must be called after simplify() if document is multi-rooted to begin with
| docelem = get_documentElement(fragment) |
| toplevel = docelem.tagName == "manual" and "chapter" or "section" |
| appendices = 0 |
| nodes = [] |
| for node in docelem.childNodes: |
| if appendices: |
| nodes.append(node) |
| elif node.nodeType == ELEMENT: |
| appnodes = node.getElementsByTagName("appendix") |
| if appnodes: |
| appendices = 1 |
| parent = appnodes[0].parentNode |
| parent.removeChild(appnodes[0]) |
| parent.normalize() |
    if nodes:
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        while nodes and nodes[0].nodeType == TEXT \
              and not nodes[0].data.strip():
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
| |
| |
| def handle_labels(doc, fragment): |
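    """Convert each <label id="..."/> into an id attribute on the enclosing
    element (or on the grandparent when the label occurs inside a <title>),
    then remove the <label> itself.
    """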
| for label in find_all_elements(fragment, "label"): |
| id = label.getAttribute("id") |
| if not id: |
| continue |
| parent = label.parentNode |
| parentTagName = parent.tagName |
| if parentTagName == "title": |
| parent.parentNode.setAttribute("id", id) |
| else: |
| parent.setAttribute("id", id) |
| # now, remove <label id="..."/> from parent: |
| parent.removeChild(label) |
| if parentTagName == "title": |
| parent.normalize() |
| children = parent.childNodes |
| if children[-1].nodeType == TEXT: |
| children[-1].data = children[-1].data.rstrip() |
| |
| |
| def fixup_trailing_whitespace(doc, fragment, wsmap): |
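    """Normalize whitespace around the closing tags of the elements named in
    wsmap.

    wsmap maps an element name to a (before-end-tag, after-end-tag) pair of
    whitespace strings; convert() supplies the table used for the Python
    documentation.
    """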
| queue = [fragment] |
| fixups = [] |
| while queue: |
| node = queue[0] |
| del queue[0] |
| if node.nodeName in wsmap: |
| fixups.append(node) |
| for child in node.childNodes: |
| if child.nodeType == ELEMENT: |
| queue.append(child) |
| |
| # reverse the list to process from the inside out |
| fixups.reverse() |
| for node in fixups: |
| node.parentNode.normalize() |
| lastchild = node.lastChild |
| before, after = wsmap[node.tagName] |
| if lastchild.nodeType == TEXT: |
| data = lastchild.data.rstrip() + before |
| lastchild.data = data |
| if wsmap[node.tagName]: |
| nextnode = node.nextSibling |
| if nextnode and nextnode.nodeType == TEXT: |
| nextnode.data = after + nextnode.data.lstrip() |
| else: |
| wsnode = doc.createTextNode(after) |
| node.parentNode.insertBefore(wsnode, nextnode) |
| # hack to get the title in place: |
| if node.tagName == "title" \ |
| and node.parentNode.firstChild.nodeType == ELEMENT: |
| node.parentNode.insertBefore(doc.createTextNode("\n "), |
| node.parentNode.firstChild) |
| node.parentNode.normalize() |
| |
| |
| def normalize(doc): |
| for node in doc.childNodes: |
| if node.nodeType == ELEMENT: |
| node.normalize() |
| |
| |
| def cleanup_trailing_parens(doc, element_names): |
| d = {} |
| for gi in element_names: |
| d[gi] = gi |
    rewrite_element = d.__contains__
| queue = [node for node in doc.childNodes if node.nodeType == ELEMENT] |
| while queue: |
| node = queue[0] |
| del queue[0] |
| if rewrite_element(node.tagName): |
| lastchild = node.lastChild |
| if lastchild and lastchild.nodeType == TEXT: |
| data = lastchild.data |
| if data.endswith("()"): |
| lastchild.data = data[:-2] |
| else: |
| for child in node.childNodes: |
| if child.nodeType == ELEMENT: |
| queue.append(child) |
| |
| |
| def contents_match(left, right): |
| left_children = left.childNodes |
| right_children = right.childNodes |
| if len(left_children) != len(right_children): |
| return 0 |
    for l, r in zip(left_children, right_children):
| nodeType = l.nodeType |
| if nodeType != r.nodeType: |
| return 0 |
| if nodeType == ELEMENT: |
| if l.tagName != r.tagName: |
| return 0 |
| # should check attributes, but that's not a problem here |
| if not contents_match(l, r): |
| return 0 |
| elif nodeType == TEXT: |
| if l.data != r.data: |
| return 0 |
| else: |
| # not quite right, but good enough |
| return 0 |
| return 1 |
| |
| |
| def create_module_info(doc, section): |
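    """Gather a section's module metadata (<modulesynopsis>, <declaremodule>,
    <moduleauthor>, <platform>, <versionadded>) into a single <moduleinfo>
    element near the top of the section, deriving a <short-synopsis> from the
    section title when possible.
    """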
| # Heavy. |
| node = extract_first_element(section, "modulesynopsis") |
| if node is None: |
| return |
| set_tagName(node, "synopsis") |
| lastchild = node.childNodes[-1] |
| if lastchild.nodeType == TEXT \ |
| and lastchild.data[-1:] == ".": |
| lastchild.data = lastchild.data[:-1] |
| modauthor = extract_first_element(section, "moduleauthor") |
| if modauthor: |
| set_tagName(modauthor, "author") |
| modauthor.appendChild(doc.createTextNode( |
| modauthor.getAttribute("name"))) |
| modauthor.removeAttribute("name") |
| platform = extract_first_element(section, "platform") |
| if section.tagName == "section": |
| modinfo_pos = 2 |
| modinfo = doc.createElement("moduleinfo") |
| moddecl = extract_first_element(section, "declaremodule") |
| name = None |
| if moddecl: |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| name = moddecl.attributes["name"].value |
| namenode = doc.createElement("name") |
| namenode.appendChild(doc.createTextNode(name)) |
| modinfo.appendChild(namenode) |
| type = moddecl.attributes.get("type") |
| if type: |
| type = type.value |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| typenode = doc.createElement("type") |
| typenode.appendChild(doc.createTextNode(type)) |
| modinfo.appendChild(typenode) |
| versionadded = extract_first_element(section, "versionadded") |
| if versionadded: |
| modinfo.setAttribute("added", versionadded.getAttribute("version")) |
| title = get_first_element(section, "title") |
| if title: |
| children = title.childNodes |
| if len(children) >= 2 \ |
| and children[0].nodeName == "module" \ |
| and children[0].childNodes[0].data == name: |
| # this is it; morph the <title> into <short-synopsis> |
| first_data = children[1] |
| if first_data.data[:4] == " ---": |
| first_data.data = first_data.data[4:].lstrip() |
| set_tagName(title, "short-synopsis") |
| if children[-1].nodeType == TEXT \ |
| and children[-1].data[-1:] == ".": |
| children[-1].data = children[-1].data[:-1] |
| section.removeChild(title) |
| section.removeChild(section.childNodes[0]) |
| title.removeChild(children[0]) |
| modinfo_pos = 0 |
| else: |
| ewrite("module name in title doesn't match" |
| " <declaremodule/>; no <short-synopsis/>\n") |
| else: |
| ewrite("Unexpected condition: <section/> without <title/>\n") |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| modinfo.appendChild(node) |
| if title and not contents_match(title, node): |
| # The short synopsis is actually different, |
| # and needs to be stored: |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| modinfo.appendChild(title) |
| if modauthor: |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| modinfo.appendChild(modauthor) |
| if platform: |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| modinfo.appendChild(platform) |
| modinfo.appendChild(doc.createTextNode("\n ")) |
| section.insertBefore(modinfo, section.childNodes[modinfo_pos]) |
| section.insertBefore(doc.createTextNode("\n "), modinfo) |
| # |
| # The rest of this removes extra newlines from where we cut out |
    # a lot of elements.  A lot of code for minimal value, but it keeps
    # the generated *ML from being too funny looking.
| # |
| section.normalize() |
| children = section.childNodes |
| for i in range(len(children)): |
| node = children[i] |
| if node.nodeName == "moduleinfo": |
| nextnode = children[i+1] |
| if nextnode.nodeType == TEXT: |
| data = nextnode.data |
| s = data.lstrip() |
| if len(s) < (len(data) - 4): |
| nextnode.data = "\n\n\n" + s |
| |
| |
| def cleanup_synopses(doc, fragment): |
| for node in find_all_elements(fragment, "section"): |
| create_module_info(doc, node) |
| |
| |
| def fixup_table_structures(doc, fragment): |
| for table in find_all_elements(fragment, "table"): |
| fixup_table(doc, table) |
| |
| |
| def fixup_table(doc, table): |
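    """Restructure a <table> into tgroup/thead/tbody form: the table's direct
    <entry> children become the heading row, each <hline> becomes a rowsep
    attribute on the preceding row, and the <row> children form the body.
    """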
| # create the table head |
| thead = doc.createElement("thead") |
| row = doc.createElement("row") |
| move_elements_by_name(doc, table, row, "entry") |
| thead.appendChild(doc.createTextNode("\n ")) |
| thead.appendChild(row) |
| thead.appendChild(doc.createTextNode("\n ")) |
| # create the table body |
| tbody = doc.createElement("tbody") |
| prev_row = None |
| last_was_hline = 0 |
| children = table.childNodes |
| for child in children: |
| if child.nodeType == ELEMENT: |
| tagName = child.tagName |
| if tagName == "hline" and prev_row is not None: |
| prev_row.setAttribute("rowsep", "1") |
| elif tagName == "row": |
| prev_row = child |
| # save the rows: |
| tbody.appendChild(doc.createTextNode("\n ")) |
| move_elements_by_name(doc, table, tbody, "row", sep="\n ") |
| # and toss the rest: |
| while children: |
| child = children[0] |
| nodeType = child.nodeType |
| if nodeType == TEXT: |
| if child.data.strip(): |
| raise ConversionError("unexpected free data in <%s>: %r" |
| % (table.tagName, child.data)) |
| table.removeChild(child) |
| continue |
| if nodeType == ELEMENT: |
| if child.tagName != "hline": |
| raise ConversionError( |
| "unexpected <%s> in table" % child.tagName) |
| table.removeChild(child) |
| continue |
| raise ConversionError( |
| "unexpected %s node in table" % child.__class__.__name__) |
| # nothing left in the <table>; add the <thead> and <tbody> |
| tgroup = doc.createElement("tgroup") |
| tgroup.appendChild(doc.createTextNode("\n ")) |
| tgroup.appendChild(thead) |
| tgroup.appendChild(doc.createTextNode("\n ")) |
| tgroup.appendChild(tbody) |
| tgroup.appendChild(doc.createTextNode("\n ")) |
| table.appendChild(tgroup) |
| # now make the <entry>s look nice: |
| for row in table.getElementsByTagName("row"): |
| fixup_row(doc, row) |
| |
| |
| def fixup_row(doc, row): |
    entries = list(row.childNodes[1:])
| for entry in entries: |
| row.insertBefore(doc.createTextNode("\n "), entry) |
| # row.appendChild(doc.createTextNode("\n ")) |
| |
| |
| def move_elements_by_name(doc, source, dest, name, sep=None): |
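    """Move every child of source named name onto the end of dest, appending
    sep as a text node after each one if it is given.
    """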
| nodes = [] |
| for child in source.childNodes: |
| if child.nodeName == name: |
| nodes.append(child) |
| for node in nodes: |
| source.removeChild(node) |
| dest.appendChild(node) |
| if sep: |
| dest.appendChild(doc.createTextNode(sep)) |
| |
| |
| RECURSE_INTO_PARA_CONTAINERS = ( |
| "chapter", "abstract", "enumerate", |
| "section", "subsection", "subsubsection", |
| "paragraph", "subparagraph", "back-matter", |
| "howto", "manual", |
| "item", "itemize", "fulllineitems", "enumeration", "descriptionlist", |
| "definitionlist", "definition", |
| ) |
| |
| PARA_LEVEL_ELEMENTS = ( |
| "moduleinfo", "title", "verbatim", "enumerate", "item", |
| "interpreter-session", "back-matter", "interactive-session", |
| "opcodedesc", "classdesc", "datadesc", |
| "cfuncdesc", "ctypedesc", "cvardesc", |
| "funcdesc", "methoddesc", "excdesc", "memberdesc", "membderdescni", |
| "funcdescni", "methoddescni", "excdescni", |
| "tableii", "tableiii", "tableiv", "localmoduletable", |
| "sectionauthor", "seealso", "itemize", |
| # include <para>, so we can just do it again to get subsequent paras: |
| PARA_ELEMENT, |
| ) |
| |
| PARA_LEVEL_PRECEEDERS = ( |
| "setindexsubitem", "author", |
| "stindex", "obindex", "COMMENT", "label", "xi:include", "title", |
| "versionadded", "versionchanged", "declaremodule", "modulesynopsis", |
| "moduleauthor", "indexterm", "leader", |
| ) |
| |
| |
| def fixup_paras(doc, fragment): |
| for child in fragment.childNodes: |
| if child.nodeName in RECURSE_INTO_PARA_CONTAINERS: |
| fixup_paras_helper(doc, child) |
| descriptions = find_all_elements(fragment, "description") |
| for description in descriptions: |
| fixup_paras_helper(doc, description) |
| |
| |
| def fixup_paras_helper(doc, container, depth=0): |
| # document is already normalized |
| children = container.childNodes |
| start = skip_leading_nodes(children) |
| while len(children) > start: |
| if children[start].nodeName in RECURSE_INTO_PARA_CONTAINERS: |
| # Something to recurse into: |
| fixup_paras_helper(doc, children[start]) |
| else: |
| # Paragraph material: |
| build_para(doc, container, start, len(children)) |
| if DEBUG_PARA_FIXER and depth == 10: |
| sys.exit(1) |
| start = skip_leading_nodes(children, start + 1) |
| |
| |
| def build_para(doc, parent, start, i): |
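    """Collect parent's children from index start up to a paragraph break (a
    blank line, a paragraph-level element, or index i) into a new <para>.
    """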
| children = parent.childNodes |
| after = start + 1 |
| have_last = 0 |
| BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + RECURSE_INTO_PARA_CONTAINERS |
| # Collect all children until \n\n+ is found in a text node or a |
| # member of BREAK_ELEMENTS is found. |
| for j in range(start, i): |
| after = j + 1 |
| child = children[j] |
| nodeType = child.nodeType |
| if nodeType == ELEMENT: |
| if child.tagName in BREAK_ELEMENTS: |
| after = j |
| break |
| elif nodeType == TEXT: |
| pos = child.data.find("\n\n") |
| if pos == 0: |
| after = j |
| break |
| if pos >= 1: |
| child.splitText(pos) |
| break |
| else: |
| have_last = 1 |
| if (start + 1) > after: |
| raise ConversionError( |
| "build_para() could not identify content to turn into a paragraph") |
| if children[after - 1].nodeType == TEXT: |
| # we may need to split off trailing white space: |
| child = children[after - 1] |
| data = child.data |
| if data.rstrip() != data: |
| have_last = 0 |
| child.splitText(len(data.rstrip())) |
| para = doc.createElement(PARA_ELEMENT) |
| prev = None |
    for j in reversed(range(start, after)):
| node = parent.childNodes[j] |
| parent.removeChild(node) |
| para.insertBefore(node, prev) |
| prev = node |
| if have_last: |
| parent.appendChild(para) |
| parent.appendChild(doc.createTextNode("\n\n")) |
| return len(parent.childNodes) |
| else: |
| nextnode = parent.childNodes[start] |
| if nextnode.nodeType == TEXT: |
| if nextnode.data and nextnode.data[0] != "\n": |
| nextnode.data = "\n" + nextnode.data |
| else: |
| newnode = doc.createTextNode("\n") |
| parent.insertBefore(newnode, nextnode) |
| nextnode = newnode |
| start = start + 1 |
| parent.insertBefore(para, nextnode) |
| return start + 1 |
| |
| |
| def skip_leading_nodes(children, start=0): |
| """Return index into children of a node at which paragraph building should |
| begin or a recursive call to fixup_paras_helper() should be made (for |
| subsections, etc.). |
| |
| When the return value >= len(children), we've built all the paras we can |
| from this list of children. |
| """ |
| i = len(children) |
| while i > start: |
| # skip over leading comments and whitespace: |
| child = children[start] |
| nodeType = child.nodeType |
| if nodeType == TEXT: |
| data = child.data |
| shortened = data.lstrip() |
| if shortened: |
| if data != shortened: |
| # break into two nodes: whitespace and non-whitespace |
| child.splitText(len(data) - len(shortened)) |
| return start + 1 |
| return start |
| # all whitespace, just skip |
| elif nodeType == ELEMENT: |
| tagName = child.tagName |
| if tagName in RECURSE_INTO_PARA_CONTAINERS: |
| return start |
| if tagName not in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS: |
| return start |
| start = start + 1 |
| return start |
| |
| |
| def fixup_rfc_references(doc, fragment): |
| for rfcnode in find_all_elements_from_set(fragment, ("pep", "rfc")): |
| rfcnode.appendChild(doc.createTextNode( |
| rfcnode.tagName.upper() + " " + rfcnode.getAttribute("num"))) |
| |
| |
| def fixup_signatures(doc, fragment): |
| for child in fragment.childNodes: |
| if child.nodeType == ELEMENT: |
| args = child.getElementsByTagName("args") |
| for arg in args: |
| rewrite_args(doc, arg) |
| args = child.getElementsByTagName("constructor-args") |
| for arg in args: |
| rewrite_args(doc, arg) |
| |
| def rewrite_args(doc, arglist): |
| fixup_args(doc, arglist) |
| arglist.normalize() |
| if arglist.childNodes.length == 1 and arglist.firstChild.nodeType == TEXT: |
| node = arglist.firstChild |
| node.data = ' '.join(node.data.split()) |
| |
| def fixup_args(doc, arglist): |
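    """Replace each <optional> wrapper in an argument list with literal "["
    and "]" text around its former contents; restart after each rewrite since
    the child list is modified in place.
    """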
| for child in arglist.childNodes: |
| if child.nodeName == "optional": |
| # found it; fix and return |
| arglist.insertBefore(doc.createTextNode("["), child) |
| optkids = child.childNodes |
| while optkids: |
| arglist.insertBefore(child.firstChild, child) |
| arglist.insertBefore(doc.createTextNode("]"), child) |
| arglist.removeChild(child) |
| return fixup_args(doc, arglist) |
| |
| |
| def fixup_sectionauthors(doc, fragment): |
| for sectauth in find_all_elements(fragment, "sectionauthor"): |
| section = sectauth.parentNode |
| section.removeChild(sectauth) |
| set_tagName(sectauth, "author") |
| sectauth.appendChild(doc.createTextNode( |
| sectauth.getAttribute("name"))) |
| sectauth.removeAttribute("name") |
| after = section.childNodes[2] |
| title = section.childNodes[1] |
| if title.nodeName != "title": |
| after = section.childNodes[0] |
| section.insertBefore(doc.createTextNode("\n "), after) |
| section.insertBefore(sectauth, after) |
| |
| |
| def fixup_verbatims(doc): |
| for verbatim in find_all_elements(doc, "verbatim"): |
| child = verbatim.childNodes[0] |
| if child.nodeType == TEXT \ |
| and child.data.lstrip().startswith(">>>"): |
| set_tagName(verbatim, "interactive-session") |
| |
| |
| def add_node_ids(fragment, counter=0): |
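    """Assign a sequential node_id to every node; later passes use it to make
    sure each parent element is processed only once.
    """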
| fragment.node_id = counter |
| for node in fragment.childNodes: |
| counter = counter + 1 |
| if node.nodeType == ELEMENT: |
| counter = add_node_ids(node, counter) |
| else: |
| node.node_id = counter |
| return counter + 1 |
| |
| |
| def fixup_ulink(doc, fragment): |
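    """Flatten each two-child <ulink>: the URL child becomes an href
    attribute and the content of the text child becomes the element's own
    content.
    """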
| for ulink in find_all_elements(fragment, "ulink"): |
| children = ulink.childNodes |
| assert len(children) == 2 |
| text = children[0] |
| href = children[1] |
| href.normalize() |
| assert len(href.childNodes) == 1 |
| assert href.childNodes[0].nodeType == TEXT |
| url = href.childNodes[0].data |
| ulink.setAttribute("href", url) |
| ulink.removeChild(href) |
| content = text.childNodes |
| while len(content): |
| ulink.appendChild(content[0]) |
| ulink.removeChild(text) |
| |
| |
| REFMODINDEX_ELEMENTS = ('refmodindex', 'refbimodindex', |
| 'refexmodindex', 'refstmodindex') |
| |
| def fixup_refmodindexes(fragment): |
| # Locate <ref*modindex>...</> co-located with <module>...</>, and |
| # remove the <ref*modindex>, replacing it with index=index on the |
| # <module> element. |
| nodes = find_all_elements_from_set(fragment, REFMODINDEX_ELEMENTS) |
| d = {} |
| for node in nodes: |
| parent = node.parentNode |
| d[parent.node_id] = parent |
| del nodes |
    for parent in d.values():
        fixup_refmodindexes_chunk(parent)
| |
| |
| def fixup_refmodindexes_chunk(container): |
    # container is probably a <para>; let's see how often it isn't:
| if container.tagName != PARA_ELEMENT: |
| bwrite("--- fixup_refmodindexes_chunk(%s)\n" % container) |
| module_entries = find_all_elements(container, "module") |
| if not module_entries: |
| return |
| index_entries = find_all_elements_from_set(container, REFMODINDEX_ELEMENTS) |
| removes = [] |
| for entry in index_entries: |
| children = entry.childNodes |
| if len(children) != 0: |
| bwrite("--- unexpected number of children for %s node:\n" |
| % entry.tagName) |
| ewrite(entry.toxml() + "\n") |
| continue |
| found = 0 |
| module_name = entry.getAttribute("module") |
| for node in module_entries: |
| if len(node.childNodes) != 1: |
| continue |
| this_name = node.childNodes[0].data |
| if this_name == module_name: |
| found = 1 |
| node.setAttribute("index", "yes") |
| if found: |
| removes.append(entry) |
| for node in removes: |
| container.removeChild(node) |
| |
| |
| def fixup_bifuncindexes(fragment): |
| nodes = find_all_elements(fragment, 'bifuncindex') |
| d = {} |
| # make sure that each parent is only processed once: |
| for node in nodes: |
| parent = node.parentNode |
| d[parent.node_id] = parent |
| del nodes |
    for parent in d.values():
        fixup_bifuncindexes_chunk(parent)
| |
| |
| def fixup_bifuncindexes_chunk(container): |
| removes = [] |
| entries = find_all_child_elements(container, "bifuncindex") |
| function_entries = find_all_child_elements(container, "function") |
| for entry in entries: |
| function_name = entry.getAttribute("name") |
| found = 0 |
| for func_entry in function_entries: |
| t2 = func_entry.childNodes[0].data |
| if t2[-2:] != "()": |
| continue |
| t2 = t2[:-2] |
| if t2 == function_name: |
| func_entry.setAttribute("index", "yes") |
| func_entry.setAttribute("module", "__builtin__") |
| if not found: |
| found = 1 |
| removes.append(entry) |
| for entry in removes: |
| container.removeChild(entry) |
| |
| |
| def join_adjacent_elements(container, gi): |
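    """Walk the tree under container, merging runs of adjacent sibling <gi>
    elements into a single element.
    """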
| queue = [container] |
| while queue: |
| parent = queue.pop() |
| i = 0 |
| children = parent.childNodes |
| nchildren = len(children) |
| while i < (nchildren - 1): |
| child = children[i] |
| if child.nodeName == gi: |
| if children[i+1].nodeName == gi: |
| ewrite("--- merging two <%s/> elements\n" % gi) |
| child = children[i] |
| nextchild = children[i+1] |
| nextchildren = nextchild.childNodes |
| while len(nextchildren): |
| node = nextchildren[0] |
| nextchild.removeChild(node) |
| child.appendChild(node) |
                    parent.removeChild(nextchild)
                    nchildren = nchildren - 1
                    continue
| if child.nodeType == ELEMENT: |
| queue.append(child) |
| i = i + 1 |
| |
| |
| _token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$") |
| |
| def write_esis(doc, ofp, knownempty): |
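    """Write doc's children to ofp as ESIS events; knownempty(gi) reports
    whether the element gi was declared empty, in which case it is written
    with an "e" event and must not have content.
    """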
| for node in doc.childNodes: |
| nodeType = node.nodeType |
| if nodeType == ELEMENT: |
| gi = node.tagName |
| if knownempty(gi): |
| if node.hasChildNodes(): |
| raise ValueError("declared-empty node <%s> has children" % gi) |
| ofp.write("e\n") |
| for k, value in node.attributes.items(): |
| if _token_rx.match(value): |
| dtype = "TOKEN" |
| else: |
| dtype = "CDATA" |
| ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value))) |
| ofp.write("(%s\n" % gi) |
| write_esis(node, ofp, knownempty) |
| ofp.write(")%s\n" % gi) |
| elif nodeType == TEXT: |
| ofp.write("-%s\n" % esistools.encode(node.data)) |
| elif nodeType == ENTITY_REFERENCE: |
| ofp.write("&%s\n" % node.nodeName) |
| else: |
| raise RuntimeError("unsupported node type: %s" % nodeType) |
| |
| |
| def convert(ifp, ofp): |
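    """Parse ESIS input from ifp into a DOM fragment, apply all of the
    transformation passes, and write the result to ofp as ESIS.
    """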
| events = esistools.parse(ifp) |
| toktype, doc = events.getEvent() |
| fragment = doc.createDocumentFragment() |
| events.expandNode(fragment) |
| |
| normalize(fragment) |
| simplify(doc, fragment) |
| handle_labels(doc, fragment) |
| handle_appendix(doc, fragment) |
| fixup_trailing_whitespace(doc, fragment, { |
| # element -> (before-end-tag, after-end-tag) |
| "abstract": ("\n", "\n"), |
| "title": ("", "\n"), |
| "chapter": ("\n", "\n\n\n"), |
| "section": ("\n", "\n\n\n"), |
| "subsection": ("\n", "\n\n"), |
| "subsubsection": ("\n", "\n\n"), |
| "paragraph": ("\n", "\n\n"), |
| "subparagraph": ("\n", "\n\n"), |
| "description": ("\n", "\n\n"), |
| "enumeration": ("\n", "\n\n"), |
| "item": ("\n", "\n\n"), |
| }) |
| cleanup_root_text(doc) |
| cleanup_trailing_parens(fragment, ["function", "method", "cfunction"]) |
| cleanup_synopses(doc, fragment) |
| fixup_descriptors(doc, fragment) |
| fixup_verbatims(fragment) |
| normalize(fragment) |
| fixup_paras(doc, fragment) |
| fixup_sectionauthors(doc, fragment) |
| fixup_table_structures(doc, fragment) |
| fixup_rfc_references(doc, fragment) |
| fixup_signatures(doc, fragment) |
| fixup_ulink(doc, fragment) |
| add_node_ids(fragment) |
| fixup_refmodindexes(fragment) |
| fixup_bifuncindexes(fragment) |
| # Take care of ugly hacks in the LaTeX markup to avoid LaTeX and |
| # LaTeX2HTML screwing with GNU-style long options (the '--' problem). |
| join_adjacent_elements(fragment, "option") |
| # Attempt to avoid trailing blank lines: |
| fragment.normalize() |
| if fragment.lastChild.data[-1:] == "\n": |
| fragment.lastChild.data = fragment.lastChild.data.rstrip() + "\n" |
| # |
| d = {} |
| for gi in events.parser.get_empties(): |
| d[gi] = gi |
| for key in ("author", "pep", "rfc"): |
| if key in d: |
| del d[key] |
    knownempty = d.__contains__
| # |
| try: |
| write_esis(fragment, ofp, knownempty) |
    except IOError as e:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other
        # errors should still be reported.
        if e.errno != errno.EPIPE:
            raise
| |
| |
| def main(): |
| if len(sys.argv) == 1: |
| ifp = sys.stdin |
| ofp = sys.stdout |
| elif len(sys.argv) == 2: |
| ifp = open(sys.argv[1]) |
| ofp = sys.stdout |
| elif len(sys.argv) == 3: |
| ifp = open(sys.argv[1]) |
        import io
        ofp = io.StringIO()
    else:
        ewrite("usage: %s [infile [outfile]]\n" % sys.argv[0])
        sys.exit(2)
| convert(ifp, ofp) |
| if len(sys.argv) == 3: |
| fp = open(sys.argv[2], "w") |
| fp.write(ofp.getvalue()) |
| fp.close() |
| ofp.close() |
| |
| |
| if __name__ == "__main__": |
| main() |