Blame - Doc/tools/sgmlconv/docfixer.py - platform/external/python/cpython3

blob: 802f3b3e94709dec730cda3b44c7bf09943faf75 [file] [log] [blame]

Fred Drake	0320473	1998-11-23 17:02:03 +0000	[diff] [blame]	1	#! /usr/bin/env python
				2
				3	"""Promote the IDs from <label/> elements to the enclosing section / chapter /
				4	whatever, then remove the <label/> elements. This allows *ML style internal
				5	linking rather than the bogus LaTeX model.
				6
				7	Note that <label/>s in <title> elements are promoted two steps, since the
				8	<title> elements are artificially created from the section parameter, and the
				9	label really refers to the sectioning construct.
				10	"""
				11	__version__ = '$Revision$'
				12
				13
				14	import errno
Fred Drake	4db5b46	1998-12-01 19:03:01 +0000	[diff] [blame]	15	import esistools
				16	import re
Fred Drake	0320473	1998-11-23 17:02:03 +0000	[diff] [blame]	17	import string
				18	import sys
				19	import xml.dom.core
				20	import xml.dom.esis_builder
				21
				22
# Debugging switch for the paragraph fixer: when true, fixup_paras() and
# fixup_paras_helper() write progress messages to sys.stderr.
DEBUG_PARA_FIXER = 0
				24
				25
# Workaround to deal with invalid documents (multiple root elements).  This
# does not indicate a bug in the DOM implementation.
#
def get_documentElement(self):
    """Return an Element wrapper for the last element child of the document.

    Every element-typed child is wrapped in turn and the final one is kept;
    None is returned when the document has no element children.
    """
    last_element = None
    for raw in self._node.children:
        if raw.type != xml.dom.core.ELEMENT:
            continue
        # A later element replaces any earlier one, so the last one wins.
        last_element = xml.dom.core.Element(raw, self, self)
    return last_element

xml.dom.core.Document.get_documentElement = get_documentElement
				37
				38
				39	# Replace get_childNodes for the Document class; without this, children
				40	# accessed from the Document object via .childNodes (no matter how many
				41	# levels of access are used) will be given an ownerDocument of None.
				42	#
				43	def get_childNodes(self):
				44	return xml.dom.core.NodeList(self._node.children, self, self)
				45
				46	xml.dom.core.Document.get_childNodes = get_childNodes
				47
				48
				49	def get_first_element(doc, gi):
				50	for n in doc.childNodes:
				51	if n.nodeType == xml.dom.core.ELEMENT and n.tagName == gi:
				52	return n
				53
				54	def extract_first_element(doc, gi):
				55	node = get_first_element(doc, gi)
				56	if node is not None:
				57	doc.removeChild(node)
				58	return node
				59
				60
				61	def simplify(doc):
				62	# Try to rationalize the document a bit, since these things are simply
				63	# not valid SGML/XML documents as they stand, and need a little work.
				64	documentclass = "document"
				65	inputs = []
				66	node = extract_first_element(doc, "documentclass")
				67	if node is not None:
				68	documentclass = node.getAttribute("classname")
				69	node = extract_first_element(doc, "title")
				70	if node is not None:
				71	inputs.append(node)
				72	# update the name of the root element
				73	node = get_first_element(doc, "document")
				74	if node is not None:
				75	node._node.name = documentclass
				76	while 1:
				77	node = extract_first_element(doc, "input")
				78	if node is None:
				79	break
				80	inputs.append(node)
				81	if inputs:
				82	docelem = doc.documentElement
				83	inputs.reverse()
				84	for node in inputs:
				85	text = doc.createTextNode("\n")
				86	docelem.insertBefore(text, docelem.firstChild)
				87	docelem.insertBefore(node, text)
				88	docelem.insertBefore(doc.createTextNode("\n"), docelem.firstChild)
				89	while doc.firstChild.nodeType == xml.dom.core.TEXT:
				90	doc.removeChild(doc.firstChild)
				91
				92
				93	def cleanup_root_text(doc):
				94	discards = []
				95	skip = 0
				96	for n in doc.childNodes:
				97	prevskip = skip
				98	skip = 0
				99	if n.nodeType == xml.dom.core.TEXT and not prevskip:
				100	discards.append(n)
Fred Drake	4db5b46	1998-12-01 19:03:01 +0000	[diff] [blame]	101	elif n.nodeType == xml.dom.core.ELEMENT and n.tagName == "COMMENT":
Fred Drake	0320473	1998-11-23 17:02:03 +0000	[diff] [blame]	102	skip = 1
				103	for node in discards:
				104	doc.removeChild(node)
				105
				106
				107	def rewrite_desc_entries(doc, argname_gi):
				108	argnodes = doc.getElementsByTagName(argname_gi)
				109	for node in argnodes:
				110	parent = node.parentNode
				111	nodes = []
				112	for n in parent.childNodes:
				113	if n.nodeType != xml.dom.core.ELEMENT or n.tagName != argname_gi:
				114	nodes.append(n)
				115	desc = doc.createElement("description")
				116	for n in nodes:
				117	parent.removeChild(n)
				118	desc.appendChild(n)
				119	if node.childNodes:
				120	# keep the <args>...</args>, newline & indent
				121	parent.insertBefore(doc.createText("\n "), node)
				122	else:
				123	# no arguments, remove the <args/> node
				124	parent.removeChild(node)
				125	parent.appendChild(doc.createText("\n "))
				126	parent.appendChild(desc)
				127	parent.appendChild(doc.createText("\n"))
				128
				129	def handle_args(doc):
				130	rewrite_desc_entries(doc, "args")
				131	rewrite_desc_entries(doc, "constructor-args")
				132
				133
def handle_appendix(doc):
    """Move everything after the <appendix/> marker into <back-matter>.

    The first <appendix> element found beneath a child of the document
    element is removed.  All children of the document element that follow
    the child containing it are moved into a new <back-matter> element
    appended to the document element; leading whitespace-only text nodes
    among them are dropped.
    """
    # must be called after simplify() if document is multi-rooted to begin with
    docelem = doc.documentElement
    appendices = 0
    nodes = []
    for node in docelem.childNodes:
        if appendices:
            nodes.append(node)
        elif node.nodeType == xml.dom.core.ELEMENT:
            appnodes = node.getElementsByTagName("appendix")
            if appnodes:
                appendices = 1
                marker = appnodes[0]
                parent = marker.parentNode
                parent.removeChild(marker)
                # Merge the text nodes left adjacent by removing the marker.
                parent.normalize()
    if nodes:
        # Explicit loops: these calls are made purely for their side effects.
        for node in nodes:
            docelem.removeChild(node)
        docelem.appendChild(doc.createTextNode("\n\n\n"))
        back = doc.createElement("back-matter")
        docelem.appendChild(back)
        back.appendChild(doc.createTextNode("\n"))
        while nodes and nodes[0].nodeType == xml.dom.core.TEXT \
              and not string.strip(nodes[0].data):
            del nodes[0]
        for node in nodes:
            back.appendChild(node)
        docelem.appendChild(doc.createTextNode("\n"))
Fred Drake	0320473	1998-11-23 17:02:03 +0000	[diff] [blame]	161
				162
				163	def handle_labels(doc):
				164	labels = doc.getElementsByTagName("label")
				165	for label in labels:
				166	id = label.getAttribute("id")
				167	if not id:
				168	continue
				169	parent = label.parentNode
				170	if parent.tagName == "title":
				171	parent.parentNode.setAttribute("id", id)
				172	else:
				173	parent.setAttribute("id", id)
				174	# now, remove <label id="..."/> from parent:
				175	parent.removeChild(label)
				176
				177
def fixup_trailing_whitespace(doc, wsmap):
    """Normalize the trailing whitespace of selected elements.

    The tree below doc is walked breadth-first.  For each element whose tag
    name is a key of wsmap, trailing whitespace in its last child (when that
    child is a text node) is replaced by the mapped string.  Elements
    without children are left alone.
    """
    queue = [doc]
    while queue:
        node = queue[0]
        del queue[0]
        if node.nodeType == xml.dom.core.ELEMENT \
           and wsmap.has_key(node.tagName):
            ws = wsmap[node.tagName]
            children = node.childNodes
            # Guard against empty elements: there is no last child to fix.
            if len(children) > 0:
                last = children[-1]
                if last.nodeType == xml.dom.core.TEXT:
                    last.data = string.rstrip(last.data) + ws
            # hack to get the title in place:
            if node.tagName == "title" \
               and node.parentNode.firstChild.nodeType == xml.dom.core.ELEMENT:
                # createTextNode() is the text factory used by the rest of
                # this file.
                node.parentNode.insertBefore(doc.createTextNode("\n "),
                                             node.parentNode.firstChild)
        for child in node.childNodes:
            if child.nodeType == xml.dom.core.ELEMENT:
                queue.append(child)
				200
				201
				202	def normalize(doc):
				203	for node in doc.childNodes:
				204	if node.nodeType == xml.dom.core.ELEMENT:
				205	node.normalize()
				206
				207
				208	def cleanup_trailing_parens(doc, element_names):
				209	d = {}
				210	for gi in element_names:
				211	d[gi] = gi
				212	rewrite_element = d.has_key
				213	queue = []
				214	for node in doc.childNodes:
				215	if node.nodeType == xml.dom.core.ELEMENT:
				216	queue.append(node)
				217	while queue:
				218	node = queue[0]
				219	del queue[0]
				220	if rewrite_element(node.tagName):
				221	children = node.childNodes
				222	if len(children) == 1 \
				223	and children[0].nodeType == xml.dom.core.TEXT:
				224	data = children[0].data
				225	if data[-2:] == "()":
				226	children[0].data = data[:-2]
				227	else:
				228	for child in node.childNodes:
				229	if child.nodeType == xml.dom.core.ELEMENT:
				230	queue.append(child)
				231
				232
def contents_match(left, right):
    """Return 1 if left and right have equivalent child content, else 0.

    Children are compared pairwise: elements must agree on tag name and,
    recursively, on their own content; text nodes must carry equal data.
    Attributes are not compared.  A child of any other node type makes the
    comparison fail.
    """
    lkids = left.childNodes
    rkids = right.childNodes
    count = len(lkids)
    if count != len(rkids):
        return 0
    for idx in range(count):
        l = lkids[idx]
        r = rkids[idx]
        kind = l.nodeType
        if kind != r.nodeType:
            return 0
        if kind == xml.dom.core.ELEMENT:
            # should check attributes, but that's not a problem here
            if l.tagName != r.tagName or not contents_match(l, r):
                return 0
        elif kind == xml.dom.core.TEXT:
            if l.data != r.data:
                return 0
        else:
            # not quite right, but good enough
            return 0
    return 1
				255
				256
def create_module_info(doc, section):
    """Gather a section's module metadata into a <moduleinfo> element.

    The section's <modulesynopsis> is extracted and renamed to <synopsis>;
    if there is none, nothing is done.  For a <section>, a <moduleinfo>
    element is built from the <declaremodule> attributes (name and optional
    type), the synopsis and, when it differs from the synopsis, the title
    morphed into <short-synopsis>; the result is inserted back into the
    section.  Diagnostics are written to sys.stderr when the title is
    missing or does not start with the declared module name.
    """
    # Heavy.
    node = extract_first_element(section, "modulesynopsis")
    if node is None:
        return
    node._node.name = "synopsis"
    lastchild = node.childNodes[-1]
    # Drop a trailing period from the synopsis text.
    if lastchild.nodeType == xml.dom.core.TEXT \
       and lastchild.data[-1:] == ".":
        lastchild.data = lastchild.data[:-1]
    if section.tagName == "section":
        # Index of the child before which <moduleinfo> is inserted; reset to
        # 0 below once the title has been taken out of the section.
        modinfo_pos = 2
        modinfo = doc.createElement("moduleinfo")
        moddecl = extract_first_element(section, "declaremodule")
        name = None
        if moddecl:
            modinfo.appendChild(doc.createTextNode("\n "))
            name = moddecl.attributes["name"].value
            namenode = doc.createElement("name")
            namenode.appendChild(doc.createTextNode(name))
            modinfo.appendChild(namenode)
            type = moddecl.attributes.get("type")
            if type:
                type = type.value
                modinfo.appendChild(doc.createTextNode("\n "))
                typenode = doc.createElement("type")
                typenode.appendChild(doc.createTextNode(type))
                modinfo.appendChild(typenode)
        title = get_first_element(section, "title")
        if title:
            children = title.childNodes
            # The title qualifies when it starts with a <module> element
            # whose text equals the declared module name.
            if len(children) >= 2 \
               and children[0].nodeType == xml.dom.core.ELEMENT \
               and children[0].tagName == "module" \
               and children[0].childNodes[0].data == name:
                # this is it; morph the <title> into <short-synopsis>
                first_data = children[1]
                if first_data.data[:4] == " ---":
                    first_data.data = string.lstrip(first_data.data[4:])
                title._node.name = "short-synopsis"
                if children[-1].data[-1:] == ".":
                    children[-1].data = children[-1].data[:-1]
                section.removeChild(title)
                # NOTE(review): removes whatever node is now first in the
                # section -- presumably the whitespace that surrounded the
                # title; confirm.
                section.removeChild(section.childNodes[0])
                title.removeChild(children[0])
                modinfo_pos = 0
            else:
                sys.stderr.write(
                    "module name in title doesn't match"
                    " <declaremodule>; no <short-synopsis>\n")
        else:
            sys.stderr.write(
                "Unexpected condition: <section> without <title>\n")
        modinfo.appendChild(doc.createTextNode("\n "))
        modinfo.appendChild(node)
        if title and not contents_match(title, node):
            # The short synopsis is actually different,
            # and needs to be stored:
            modinfo.appendChild(doc.createTextNode("\n "))
            modinfo.appendChild(title)
        modinfo.appendChild(doc.createTextNode("\n "))
        section.insertBefore(modinfo, section.childNodes[modinfo_pos])
        section.insertBefore(doc.createTextNode("\n "), modinfo)
				320
				321
def cleanup_synopses(doc):
    """Run create_module_info() on every top-level <section> of doc."""
    for child in doc.childNodes:
        if child.nodeType != xml.dom.core.ELEMENT:
            continue
        if child.tagName == "section":
            create_module_info(doc, child)
				327
				328
# Tag names of the sectioning containers whose content is regrouped into
# <para> elements: fixup_paras() starts from these, fixup_paras_helper()
# recurses into them, and build_para() ends a paragraph when it meets one.
FIXUP_PARA_ELEMENTS = (
    "chapter",
    "section", "subsection", "subsubsection",
    "paragraph", "subparagraph")

# Tag names of elements that already stand at paragraph level: the paragraph
# fixer skips over them when they lead a container, and build_para() ends
# the paragraph being collected when it reaches one.
PARA_LEVEL_ELEMENTS = (
    "moduleinfo", "title", "opcodedesc",
    "verbatim", "funcdesc", "methoddesc", "excdesc", "datadesc",
    "funcdescni", "methoddescni", "excdescni", "datadescni",
    "tableii", "tableiii", "tableiv", "localmoduletable",
    "sectionauthor",
    # include <para>, so we can just do it again to get subsequent paras:
    "para",
    )

# Tag names of elements that are skipped, like the paragraph-level elements
# above, when they appear ahead of the content that starts a paragraph.
PARA_LEVEL_PRECEEDERS = (
    "index", "indexii", "indexiii", "indexiv",
    "stindex", "obindex", "COMMENT", "label",
    )
				348
def fixup_paras(doc):
    """Group loose content into <para> elements throughout the document.

    Every top-level element named in FIXUP_PARA_ELEMENTS is handed to
    fixup_paras_helper(), followed by every <description> element found in
    the document.
    """
    for child in doc.childNodes:
        if child.nodeType == xml.dom.core.ELEMENT \
           and child.tagName in FIXUP_PARA_ELEMENTS:
            fixup_paras_helper(doc, child)
    # Search from the document itself rather than from the loop variable:
    # that name is unbound for a document without children and otherwise
    # refers only to the last root-level node.
    descriptions = doc.getElementsByTagName("description")
    for description in descriptions:
        if DEBUG_PARA_FIXER:
            sys.stderr.write("-- Fixing up <description> element...\n")
        fixup_paras_helper(doc, description)
				359
				360
def fixup_paras_helper(doc, container):
    """Wrap the next run of loose content in container in a <para>.

    The children of container are scanned up to the first nested
    sectioning element (which is processed recursively).  If that leaves
    content to wrap, one <para> is built with build_para() and the function
    calls itself again on the same container to handle what remains.
    """
    # document is already normalized
    children = container.childNodes
    # start: index of the first child that may begin a paragraph.
    # start_fixed: true once real content has been seen, freezing start.
    # i: number of children examined before the scan stopped.
    start = 0
    start_fixed = 0
    i = 0
    SKIP_ELEMENTS = PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS
    for child in children:
        if child.nodeType == xml.dom.core.ELEMENT:
            if child.tagName in FIXUP_PARA_ELEMENTS:
                # A nested sectioning element ends this container's run.
                fixup_paras_helper(doc, child)
                break
            elif child.tagName in SKIP_ELEMENTS:
                # Leading skippable elements push the start forward.
                if not start_fixed:
                    start = i + 1
            elif not start_fixed:
                start_fixed = 1
            i = i + 1
        else:
            # Non-whitespace text counts as real content.
            if child.nodeType == xml.dom.core.TEXT \
               and string.strip(child.data) and not start_fixed:
                start_fixed = 1
            i = i + 1
    if DEBUG_PARA_FIXER:
        sys.stderr.write("fixup_paras_helper() called on <%s>; %d, %d\n"
                         % (container.tagName, start, i))
    if i > start:
        # the first [start:i] children should be rewritten as <para> elements
        # start by breaking text nodes that contain \n\n+ into multiple nodes
        nstart, i = skip_leading_nodes(container.childNodes, start, i)
        if i > nstart:
            build_para(doc, container, nstart, i)
            # Process the rest of the container; the <para> just built is
            # itself a skippable element on the next pass.
            fixup_paras_helper(doc, container)
				394
				395
def build_para(doc, parent, start, i):
    """Move a run of parent's children, beginning at start, into a <para>.

    Children are taken from index start up to (not including) index i,
    stopping early at a paragraph-level or sectioning element or at a blank
    line ("\\n\\n") inside a text node.  Trailing whitespace of the run is
    split off and left outside the new element.
    """
    children = parent.childNodes
    # collect all children until \n\n+ is found in a text node or a
    # PARA_LEVEL_ELEMENT is found.
    # after: index one past the last child that goes into the paragraph.
    after = start + 1
    # have_last: true when the scan ran through index i - 1 without a break.
    have_last = 0
    BREAK_ELEMENTS = PARA_LEVEL_ELEMENTS + FIXUP_PARA_ELEMENTS
    for j in range(start, i):
        after = j + 1
        child = children[j]
        nodeType = child.nodeType
        if nodeType == xml.dom.core.ELEMENT:
            if child.tagName in BREAK_ELEMENTS:
                after = j
                break
        elif nodeType == xml.dom.core.TEXT:
            pos = string.find(child.data, "\n\n")
            if pos == 0:
                # The text starts with a blank line: exclude it entirely.
                after = j
                break
            if pos >= 1:
                # Keep the text before the blank line; split off the rest.
                child.splitText(pos)
                break
    else:
        have_last = 1
    if children[after - 1].nodeType == xml.dom.core.TEXT:
        # we may need to split off trailing white space:
        child = children[after - 1]
        data = child.data
        if string.rstrip(data) != data:
            have_last = 0
            child.splitText(len(string.rstrip(data)))
    # Re-fetch the child list after the splits above.
    children = parent.childNodes
    para = doc.createElement("para")
    prev = None
    # Move the nodes last-to-first, inserting each before the previously
    # moved one, so indexes stay valid and the order is preserved.
    indexes = range(start, after)
    indexes.reverse()
    for j in indexes:
        node = children[j]
        parent.removeChild(node)
        para.insertBefore(node, prev)
        prev = node
    # NOTE(review): appending assumes the run reached the end of parent's
    # children whenever have_last is set -- confirm against the caller.
    if have_last:
        parent.appendChild(para)
    else:
        parent.insertBefore(para, parent.childNodes[start])
				442
				443
def skip_leading_nodes(children, start, i):
    """Advance start past nodes that should not begin a paragraph.

    Comment nodes, whitespace-only text nodes and the elements named in
    PARA_LEVEL_ELEMENTS / PARA_LEVEL_PRECEEDERS are skipped.  A text node
    with leading whitespace is split so the whitespace stays outside the
    paragraph.  Returns the adjusted (start, i) pair; i is first clamped to
    the number of children.
    """
    i = min(i, len(children))
    while i > start:
        # skip over leading comments and whitespace:
        try:
            child = children[start]
        except IndexError:
            # Report where the scan failed, then let the error propagate.
            sys.stderr.write(
                "skip_leading_nodes() failed at index %d\n" % start)
            raise
        nodeType = child.nodeType
        if nodeType == xml.dom.core.COMMENT:
            start = start + 1
        elif nodeType == xml.dom.core.TEXT:
            data = child.data
            shortened = string.lstrip(data)
            if shortened:
                if data != shortened:
                    # break into two nodes: whitespace and non-whitespace
                    child.splitText(len(data) - len(shortened))
                    # The split added one node, so both bounds move by one.
                    return start + 1, i + 1
                break
            # all whitespace, just skip
            start = start + 1
        elif nodeType == xml.dom.core.ELEMENT:
            if child.tagName in PARA_LEVEL_ELEMENTS + PARA_LEVEL_PRECEEDERS:
                start = start + 1
            else:
                break
        else:
            break
    return start, i
Fred Drake	fba0ba2	1998-12-10 05:07:09 +0000	[diff] [blame]	476
				477
# Attribute values matching this pattern are written as TOKEN, others as CDATA.
_token_rx = re.compile(r"[a-zA-Z][a-zA-Z0-9.-]*$")

def write_esis(doc, ofp, knownempty):
    """Write the children of doc to ofp as ESIS-style records.

    Elements produce their attribute lines, an open line, their content
    (recursively) and a close line; elements for which knownempty(tagname)
    is true are preceded by an "e" line.  Text nodes produce a data line.

    Raises ValueError if an element declared empty has children, and
    RuntimeError for any node that is neither an element nor text.
    """
    for node in doc.childNodes:
        kind = node.nodeType
        if kind == xml.dom.core.TEXT:
            ofp.write("-%s\n" % esistools.encode(node.data))
            continue
        if kind != xml.dom.core.ELEMENT:
            raise RuntimeError("unsupported node type: %s" % kind)
        gi = node.tagName
        if knownempty(gi):
            if node.hasChildNodes():
                raise ValueError("declared-empty node has children")
            ofp.write("e\n")
        for attrname, attr in node.attributes.items():
            value = attr.value
            if _token_rx.match(value):
                dtype = "TOKEN"
            else:
                dtype = "CDATA"
            ofp.write("A%s %s %s\n"
                      % (attrname, dtype, esistools.encode(value)))
        ofp.write("(%s\n" % gi)
        write_esis(node, ofp, knownempty)
        ofp.write(")%s\n" % gi)
				503
				504
def convert(ifp, ofp):
    """Read a document from ifp, apply every fix-up, and write it to ofp.

    The input is parsed with esistools.ExtendedEsisBuilder, run through the
    fix-up passes in a fixed order, and written out with write_esis().  An
    IOError carrying errno EPIPE during output is ignored; any other error
    propagates.
    """
    p = esistools.ExtendedEsisBuilder()
    p.feed(ifp.read())
    doc = p.document
    normalize(doc)
    handle_args(doc)
    simplify(doc)
    handle_labels(doc)
    handle_appendix(doc)
    fixup_trailing_whitespace(doc, {
        "abstract": "\n",
        "title": "",
        "chapter": "\n\n",
        "section": "\n\n",
        "subsection": "\n\n",
        "subsubsection": "\n\n",
        "paragraph": "\n\n",
        "subparagraph": "\n\n",
        })
    cleanup_root_text(doc)
    cleanup_trailing_parens(doc, ["function", "method", "cfunction"])
    cleanup_synopses(doc)
    normalize(doc)
    fixup_paras(doc)
    #
    # Build a lookup of the element names the parser saw declared empty.
    d = {}
    for gi in p.get_empties():
        d[gi] = gi
    knownempty = d.has_key
    #
    try:
        write_esis(doc, ofp, knownempty)
    except IOError:
        # Ignore EPIPE; it just means that whoever we're writing to stopped
        # reading.  The rest of the output would be ignored.  All other
        # errors should still be reported.
        #
        # The exception is inspected through its errno attribute instead of
        # being unpacked in the except clause, so an IOError whose arguments
        # are not an (errno, message) pair is re-raised intact rather than
        # masked by an unpacking failure.
        err = sys.exc_info()[1]
        if getattr(err, "errno", None) != errno.EPIPE:
            raise
				543
				544
				545	def main():
				546	if len(sys.argv) == 1:
				547	ifp = sys.stdin
				548	ofp = sys.stdout
				549	elif len(sys.argv) == 2:
				550	ifp = open(sys.argv[1])
				551	ofp = sys.stdout
				552	elif len(sys.argv) == 3:
				553	ifp = open(sys.argv[1])
				554	ofp = open(sys.argv[2], "w")
				555	else:
				556	usage()
				557	sys.exit(2)
				558	convert(ifp, ofp)
				559
				560
				561	if __name__ == "__main__":
				562	main()