blob: 802f3b3e94709dec730cda3b44c7bf09943faf75 [file] [log] [blame]
Fred Drake03204731998-11-23 17:02:03 +00001#! /usr/bin/env python
2
3"""Promote the IDs from <label/> elements to the enclosing section / chapter /
4whatever, then remove the <label/> elements. This allows *ML style internal
5linking rather than the bogus LaTeX model.
6
7Note that <label/>s in <title> elements are promoted two steps, since the
8<title> elements are artificially created from the section parameter, and the
9label really refers to the sectioning construct.
10"""
11__version__ = '$Revision$'
12
13
14import errno
Fred Drake4db5b461998-12-01 19:03:01 +000015import esistools
16import re
Fred Drake03204731998-11-23 17:02:03 +000017import string
18import sys
19import xml.dom.core
20import xml.dom.esis_builder
21
22
# When true, fixup_paras() and fixup_paras_helper() write trace messages
# to sys.stderr while they work.
DEBUG_PARA_FIXER = 0
24
25
# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    """Return an Element wrapper for the document's last element child.

    Every element child is wrapped in turn and the most recent wrapper is
    kept, so with several root elements the final one is returned.  None
    is returned when the document has no element children at all.
    """
    wrapped = None
    for raw in self._node.children:
        if raw.type != xml.dom.core.ELEMENT:
            continue
        wrapped = xml.dom.core.Element(raw, self, self)
    return wrapped

xml.dom.core.Document.get_documentElement = get_documentElement
37
38
# Replace get_childNodes for the Document class; without this, children
# accessed from the Document object via .childNodes (no matter how many
# levels of access are used) will be given an ownerDocument of None.
#
def get_childNodes(self):
    """Return the document's children as a NodeList owned by the document."""
    raw_children = self._node.children
    return xml.dom.core.NodeList(raw_children, self, self)

xml.dom.core.Document.get_childNodes = get_childNodes
47
48
def get_first_element(doc, gi):
    """Return the first direct child element of `doc` named `gi`.

    Only immediate children are examined.  Returns None when no child
    element carries that tag name.
    """
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == gi:
            return child
    return None
53
def extract_first_element(doc, gi):
    """Detach and return the first direct child element of `doc` named `gi`.

    Returns None, leaving `doc` untouched, when there is no such child.
    """
    found = get_first_element(doc, gi)
    if found is None:
        return None
    doc.removeChild(found)
    return found
59
60
def simplify(doc):
    """Rationalize the top level of `doc` into a single usable root.

    The steps, in order:
      * remove the first top-level <documentclass> and remember its
        "classname" attribute (default "document");
      * remove the first top-level <title> and every top-level <input>,
        collecting them in document order;
      * rename the first top-level <document> element to the class name;
      * re-insert the collected nodes at the start of the document
        element, each followed by a newline text node, with one more
        newline text node in front of them;
      * strip leading text nodes from the document itself.
    """
    # Try to rationalize the document a bit, since these things are simply
    # not valid SGML/XML documents as they stand, and need a little work.
    documentclass = "document"
    inputs = []
    node = extract_first_element(doc, "documentclass")
    if node is not None:
        documentclass = node.getAttribute("classname")
    node = extract_first_element(doc, "title")
    if node is not None:
        inputs.append(node)
    # update the name of the root element
    node = get_first_element(doc, "document")
    if node is not None:
        node._node.name = documentclass
    while 1:
        node = extract_first_element(doc, "input")
        if node is None:
            break
        inputs.append(node)
    if inputs:
        docelem = doc.documentElement
        # Insert back-to-front so the nodes end up in their original order.
        inputs.reverse()
        for node in inputs:
            text = doc.createTextNode("\n")
            docelem.insertBefore(text, docelem.firstChild)
            docelem.insertBefore(node, text)
        docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
    # Stop when the document runs out of children rather than failing with
    # an AttributeError on a missing first child.
    while doc.firstChild is not None \
          and doc.firstChild.nodeType == xml.dom.core.TEXT:
        doc.removeChild(doc.firstChild)
91
92
def cleanup_root_text(doc):
    """Remove top-level text nodes, sparing those right after a COMMENT.

    A text node that directly follows a top-level <COMMENT> element is
    kept; every other text node that is a direct child of `doc` is
    removed.  Removal happens after the scan so the child list is not
    modified while it is being walked.
    """
    doomed = []
    after_comment = 0
    for child in doc.childNodes:
        protected = after_comment
        after_comment = 0
        kind = child.nodeType
        if kind == xml.dom.core.TEXT:
            if not protected:
                doomed.append(child)
        elif kind == xml.dom.core.ELEMENT:
            if child.tagName == "COMMENT":
                after_comment = 1
    for child in doomed:
        doc.removeChild(child)
105
106
def rewrite_desc_entries(doc, argname_gi):
    """Wrap the content beside each `argname_gi` element in <description>.

    For every element named `argname_gi` in the document, all children of
    its parent other than `argname_gi` elements are moved, in order, into
    a new <description> element.  The `argname_gi` element is kept
    (preceded by a newline/indent text node) when it has children and
    removed when it is empty.  The <description> is then appended to the
    parent, surrounded by newline text nodes.
    """
    argnodes = doc.getElementsByTagName(argname_gi)
    for node in argnodes:
        parent = node.parentNode
        # Snapshot the siblings first; the parent is modified below.
        nodes = []
        for n in parent.childNodes:
            if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:
                nodes.append(n)
        desc = doc.createElement("description")
        for n in nodes:
            parent.removeChild(n)
            desc.appendChild(n)
        # Text nodes are created with createTextNode(), the standard DOM
        # method used throughout the rest of this module.
        if node.childNodes:
            # keep the <args>...</args>, newline & indent
            parent.insertBefore(doc.createTextNode("\n "), node)
        else:
            # no arguments, remove the <args/> node
            parent.removeChild(node)
        parent.appendChild(doc.createTextNode("\n "))
        parent.appendChild(desc)
        parent.appendChild(doc.createTextNode("\n"))
128
def handle_args(doc):
    """Run rewrite_desc_entries() for both argument-list element names."""
    for gi in ("args", "constructor-args"):
        rewrite_desc_entries(doc, gi)
132
133
def handle_appendix(doc):
    """Move everything after the <appendix> marker into <back-matter>.

    The children of the document element are scanned for the first
    element that contains an <appendix> element.  That <appendix> element
    is removed and its former parent normalized.  Every later child of the
    document element is then moved into a new <back-matter> element, which
    is appended to the document element.  Leading whitespace-only text
    nodes are dropped from the moved nodes.  Nothing is changed when no
    nodes follow the marker.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = doc.documentElement
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == xml.dom.core.ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                parent = appnodes[0].parentNode
                parent.removeChild(appnodes[0])
                parent.normalize()
    if nodes:
        # Explicit loops: these calls are made only for their side effects.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        while nodes and nodes[0].nodeType == xml.dom.core.TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
Fred Drake03204731998-11-23 17:02:03 +0000161
162
def handle_labels(doc):
    """Promote the IDs of <label id="..."/> elements, then remove them.

    The "id" of each <label> is copied to its parent element, or to the
    parent's parent when the label sits directly inside a <title>.  The
    <label> element is then removed.  Labels without a non-empty "id"
    attribute are left in place untouched.
    """
    for label in doc.getElementsByTagName("label"):
        label_id = label.getAttribute("id")
        if not label_id:
            continue
        parent = label.parentNode
        if parent.tagName == "title":
            # The label really refers to the construct owning the title.
            target = parent.parentNode
        else:
            target = parent
        target.setAttribute("id", label_id)
        # now, remove <label id="..."/> from parent:
        parent.removeChild(label)
176
177
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the trailing whitespace of selected elements.

    `wsmap` maps tag names to the whitespace string that should end the
    element's content.  The tree is walked breadth-first from `doc`.  For
    each element whose tag name is a key of `wsmap`, a trailing text node
    has its trailing whitespace replaced by the mapped string.  Elements
    with no children, or whose last child is not text, are left alone.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            # Look at the last child directly; an element without children
            # has no trailing text to adjust.
            if len(children) > 0:
                last = children[-1]
                if last.nodeType == xml.dom.core.TEXT:
                    last.data = string.rstrip(last.data) + ws
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)
200
201
def normalize(doc):
    """Call normalize() on every element that is a direct child of `doc`."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        child.normalize()
206
207
def cleanup_trailing_parens(doc, element_names):
    """Strip a trailing "()" from the text of the named elements.

    The tree is walked breadth-first from the element children of `doc`.
    An element whose tag name is in `element_names` is not descended
    into; if its only child is a text node ending in "()", those two
    characters are removed.  All other elements have their element
    children queued for inspection.
    """
    wanted = {}
    for name in element_names:
        wanted[name] = name
    is_target = wanted.has_key
    pending = []
    for top in doc.childNodes:
        if top.nodeType == xml.dom.core.ELEMENT:
            pending.append(top)
    while pending:
        current = pending[0]
        del pending[0]
        kids = current.childNodes
        if not is_target(current.tagName):
            for kid in kids:
                if kid.nodeType == xml.dom.core.ELEMENT:
                    pending.append(kid)
            continue
        if len(kids) != 1:
            continue
        only = kids[0]
        if only.nodeType != xml.dom.core.TEXT:
            continue
        text = only.data
        if text[-2:] == "()":
            only.data = text[:-2]
231
232
def contents_match(left, right):
    """Return 1 if `left` and `right` have equivalent content, else 0.

    The child lists must have the same length and match pairwise:
    elements by tag name and, recursively, by content (attributes are not
    compared); text nodes by their data.  Any other node type counts as a
    mismatch.
    """
    lkids = left.childNodes
    rkids = right.childNodes
    count = len(lkids)
    if count != len(rkids):
        return 0
    for idx in range(count):
        a = lkids[idx]
        b = rkids[idx]
        kind = a.nodeType
        if kind != b.nodeType:
            return 0
        if kind == xml.dom.core.ELEMENT:
            # should check attributes, but that's not a problem here
            if a.tagName != b.tagName or not contents_match(a, b):
                return 0
        elif kind == xml.dom.core.TEXT:
            if a.data != b.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
255
256
def create_module_info(doc, section):
    """Gather a section's module metadata into a <moduleinfo> element.

    Does nothing unless `section` has a direct <modulesynopsis> child.
    That child is removed, renamed to <synopsis>, and loses a trailing
    "." from its final text node.  When `section` is a <section>:

      * a <moduleinfo> element is built holding <name> and, if present,
        <type>, taken from the "name" and "type" attributes of the
        section's <declaremodule> child (which is removed);
      * if the <title> starts with a <module> element whose text equals
        the module name, the title is renamed to <short-synopsis>, has
        its leading " ---" and trailing "." trimmed, and is removed from
        the section along with the section's then-first child;
      * the <synopsis>, plus the title if its content differs from the
        synopsis, are appended to <moduleinfo>, which is inserted into
        the section.

    Diagnostics are written to sys.stderr when the title is missing or
    does not name the declared module.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    node._node.name = "synopsis"
    lastchild = node.childNodes[-1]
    if lastchild.nodeType == xml.dom.core.TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    if section.tagName == "section":
        # Index in section.childNodes before which <moduleinfo> is
        # inserted; reset to 0 once the title has been pulled out.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type_attr = moddecl.attributes.get("type")
            if type_attr:
                modtype = type_attr.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(modtype))
                modinfo.appendChild(typenode)
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            if len(children) >= 2 \
               and children[0].nodeType == xml.dom.core.ELEMENT \
               and children[0].tagName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
                # Only a text node has .data; a title that ends with an
                # element must not be touched here.
                if children[-1].nodeType == xml.dom.core.TEXT \
                   and children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                sys.stderr.write(
                    "module name in title doesn't match"
                    " <declaremodule>; no <short-synopsis>\n")
        else:
            sys.stderr.write(
                "Unexpected condition: <section> without <title>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
320
321
def cleanup_synopses(doc):
    """Run create_module_info() on every top-level <section> of `doc`."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == "section":
            create_module_info(doc, child)
327
328
# Sectioning elements that fixup_paras() and fixup_paras_helper() descend
# into; build_para() also treats them as paragraph boundaries.
FIXUP_PARA_ELEMENTS = (
    "chapter",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph")

# Elements that stand beside paragraphs rather than inside them:
# build_para() ends a paragraph when it reaches one, and the scanning code
# skips over them when looking for the start of paragraph content.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "opcodedesc",
    "verbatim", "funcdesc", "methoddesc", "excdesc", "datadesc",
    "funcdescni", "methoddescni", "excdescni", "datadescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor",
    # include <para>, so we can just do it again to get subsequent paras:
    "para",
    )

# Elements that are skipped, together with PARA_LEVEL_ELEMENTS, when they
# appear before the first paragraph content of a container.
PARA_LEVEL_PRECEEDERS = (
    "index", "indexii", "indexiii", "indexiv",
    "stindex", "obindex", "COMMENT", "label",
    )
348
def fixup_paras(doc):
    """Wrap loose paragraph content in <para> elements.

    fixup_paras_helper() is applied to every top-level element of `doc`
    whose tag name is in FIXUP_PARA_ELEMENTS, and then to every
    <description> element found anywhere in the document.
    """
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT \
           and child.tagName in FIXUP_PARA_ELEMENTS:
            fixup_paras_helper(doc, child)
    # Search from the document itself so that every <description> is
    # found, whichever top-level node it lives under, and so that an
    # empty document is handled.
    descriptions = doc.getElementsByTagName("description")
    for description in descriptions:
        if DEBUG_PARA_FIXER:
            sys.stderr.write("-- Fixing up <description> element...\n")
        fixup_paras_helper(doc, description)
359
360
def fixup_paras_helper(doc, container):
    """Wrap the next run of loose content in `container` in a <para>.

    The children of `container` are scanned to find the index range
    [start, i) of paragraph material.  If the range is non-empty after
    skip_leading_nodes() has trimmed it, build_para() creates one <para>
    and this function calls itself again on the same container to pick up
    any following paragraphs.
    """
    # document is already normalized
    children = container.childNodes
    start = 0           # index of the first child that may start a paragraph
    start_fixed = 0     # true once paragraph content has been seen
    i = 0               # number of children scanned so far
    SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    for child in children:
        if child.nodeType == xml.dom.core.ELEMENT:
            if child.tagName in FIXUP_PARA_ELEMENTS:
                # Nested sectioning element: process it and stop scanning.
                # NOTE(review): the scan ends at the first such child, so
                # later sibling sections do not appear to be visited from
                # here -- confirm this is intended.
                fixup_paras_helper(doc, child)
                break
            elif child.tagName in SKIP_ELEMENTS:
                # Not paragraph content; while nothing has been found yet,
                # the paragraph can start no earlier than the next child.
                if not start_fixed:
                    start = i + 1
            elif not start_fixed:
                # Any other element counts as paragraph content.
                start_fixed = 1
            i = i + 1
        else:
            # Non-blank text also counts as paragraph content.
            if child.nodeType == xml.dom.core.TEXT \
               and string.strip(child.data) and not start_fixed:
                start_fixed = 1
            i = i + 1
    if DEBUG_PARA_FIXER:
        sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n"
                         % (container.tagName, start, i))
    if i > start:
        # the first [start:i] children should be rewritten as <para> elements
        # start by breaking text nodes that contain \n\n+ into multiple nodes
        nstart, i = skip_leading_nodes(container.childNodes, start, i)
        if i > nstart:
            build_para(doc, container, nstart, i)
            # The new <para> is skipped on the next pass ("para" is listed
            # in PARA_LEVEL_ELEMENTS), so repeating finds the next one.
            fixup_paras_helper(doc, container)
394
395
def build_para(doc, parent, start, i):
    """Move a run of `parent`'s children into a new <para> element.

    Starting at index `start`, children are collected up to (not
    including) index `i`, stopping early at an element listed in
    PARA_LEVEL_ELEMENTS or FIXUP_PARA_ELEMENTS, or at a blank line
    ("\\n\\n") inside a text node.  Text nodes are split where needed so
    the break falls on a node boundary.  The <para> takes the place of the
    collected nodes in `parent`.
    """
    children = parent.childNodes
    # collect all children until \n\n+ is found in a text node or a
    # PARA_LEVEL_ELEMENT is found.
    after = start + 1       # index just past the last node to collect
    have_last = 0           # true if the para runs to the end of the range
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                # The breaking element itself stays outside the para.
                after = j
                break
        elif nodeType == xml.dom.core.TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                # Text starts with a blank line: it belongs to what follows.
                after = j
                break
            if pos >= 1:
                # Keep the text before the blank line; the rest is split
                # off into a following node.
                child.splitText(pos)
                break
    else:
        # Loop ran to completion without hitting a break condition.
        have_last = 1
    if children[after - 1].nodeType == xml.dom.core.TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            # The split-off whitespace node now follows the para content,
            # so the para is no longer last.
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    children = parent.childNodes
    para = doc.createElement("para")
    prev = None
    # Move the nodes back-to-front, inserting each before the previously
    # moved one, so their order is preserved inside the para.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = children[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    if have_last:
        parent.appendChild(para)
    else:
        # After the removals, the node now at `start` is the one that
        # followed the collected run.
        parent.insertBefore(para, parent.childNodes[start])
441 parent.insertBefore(para, parent.childNodes[start])
442
443
def skip_leading_nodes(children, start, i):
    """Advance `start` past nodes that cannot begin a paragraph.

    Skips comment nodes, whitespace-only text nodes, and elements listed
    in PARA_LEVEL_ELEMENTS or PARA_LEVEL_PRECEEDERS.  A text node with
    leading whitespace followed by content is split in two, so the
    paragraph starts at the non-whitespace part.

    Returns the pair (start, i) for the adjusted range.  `i` is first
    clamped to len(children) and is increased by one when a split
    inserted a node.
    """
    i = min(i, len(children))
    while i > start:
        # skip over leading comments and whitespace:
        try:
            child = children[start]
        except IndexError:
            # Report which index failed, then let the error propagate.
            sys.stderr.write(
                "skip_leading_nodes() failed at index %d\n" % start)
            raise
        nodeType = child.nodeType
        if nodeType == xml.dom.core.COMMENT:
            start = start + 1
        elif nodeType == xml.dom.core.TEXT:
            data = child.data
            shortened = string.lstrip(data)
            if shortened:
                if data != shortened:
                    # break into two nodes: whitespace and non-whitespace
                    child.splitText(len(data) - len(shortened))
                    # Both bounds move past the new whitespace-only node.
                    return start + 1, i + 1
                break
            # all whitespace, just skip
            start = start + 1
        elif nodeType == xml.dom.core.ELEMENT:
            if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
                start = start + 1
            else:
                break
        else:
            break
    return start, i
Fred Drakefba0ba21998-12-10 05:07:09 +0000476
477
# Attribute values that match this pattern (a letter followed by letters,
# digits, "." or "-") are written with type TOKEN by write_esis(); all
# other values are written as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")
Fred Drakefcc59101999-01-06 22:50:52 +0000479
def write_esis(doc, ofp, knownempty):
    """Write the children of `doc` to `ofp` as ESIS records, recursively.

    For each element: an "e" line if `knownempty(tagname)` is true, one
    "A<name> <TOKEN|CDATA> <value>" line per attribute, a "(<tagname>"
    line, the element's own children, and a ")<tagname>" line.  Text
    nodes are written as "-<data>".  Attribute values and text are passed
    through esistools.encode().

    Raises ValueError if an element declared empty has children, and
    RuntimeError for any node that is neither an element nor text.
    """
    for node in doc.childNodes:
        nodeType = node.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            gi = node.tagName
            if knownempty(gi):
                if node.hasChildNodes():
                    raise ValueError("declared-empty node has children")
                ofp.write("e\n")
            for k, v in node.attributes.items():
                value = v.value
                if _token_rx.match(value):
                    dtype = "TOKEN"
                else:
                    dtype = "CDATA"
                ofp.write("A%s %s %s\n" % (k, dtype, esistools.encode(value)))
            ofp.write("(%s\n" % gi)
            write_esis(node, ofp, knownempty)
            ofp.write(")%s\n" % gi)
        elif nodeType == xml.dom.core.TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
        else:
            raise RuntimeError("unsupported node type: %s" % nodeType)
503
504
def convert(ifp, ofp):
    """Read an ESIS stream from `ifp`, fix up the document, write to `ofp`.

    The input is parsed with esistools.ExtendedEsisBuilder.  The fix-up
    passes then run in a fixed order, since later passes rely on the
    structure produced by earlier ones.  The result is written with
    write_esis().  An IOError whose errno is EPIPE is ignored while
    writing; any other error propagates.
    """
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    normalize(doc)
    handle_args(doc)
    simplify(doc)
    handle_labels(doc)
    handle_appendix(doc)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
    normalize(doc)
    fixup_paras(doc)
    #
    # Build a membership test for the element types declared empty.
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    knownempty = d.has_key
    #
    try:
        write_esis(doc, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other errors
        # should still be reported.  The errno is read from the exception
        # object, so an IOError that does not carry an (errno, message)
        # pair is re-raised as-is instead of failing to unpack.
        exc = sys.exc_info()[1]
        if getattr(exc, "errno", None) != errno.EPIPE:
            raise
543
544
def main():
    """Command-line entry point: docfixer [infile [outfile]].

    With no arguments, reads standard input and writes standard output.
    One argument names the input file; a second names the output file.
    Any other argument count prints a usage message to sys.stderr and
    exits with status 2.  Files opened here are closed when conversion
    finishes, whether or not it succeeds.
    """
    argc = len(sys.argv)
    if argc not in (1, 2, 3):
        sys.stderr.write("usage: %s [infile [outfile]]\n" % sys.argv[0])
        sys.exit(2)
    ifp = sys.stdin
    ofp = sys.stdout
    if argc >= 2:
        ifp = open(sys.argv[1])
    if argc == 3:
        ofp = open(sys.argv[2], "w")
    try:
        convert(ifp, ofp)
    finally:
        # Close only what was opened here; leave the standard streams alone.
        if ofp is not sys.stdout:
            ofp.close()
        if ifp is not sys.stdin:
            ifp.close()


if __name__ == "__main__":
    main()