Blame - Doc/tools/html2texi.pl - platform/external/python/cpython2

blob: 740a7e401f703aee6b7d4476a2883abfedc7c025 [file] [log] [blame]

Fred Drake	3fe1d32	1999-01-08 15:25:29 +0000	[diff] [blame]	1	#! /usr/bin/env perl -w
				2	# html2texi.pl -- Convert HTML documentation to Texinfo format
				3	# Michael Ernst <mernst@cs.washington.edu>
				4	# Time-stamp: <1998-09-10 12:52:38 mernst>
				5
				6	# This program converts HTML documentation trees into Texinfo format.
				7	# Given the name of a main (or contents) HTML file, it processes that file,
				8	# and other files (transitively) referenced by it, into a Texinfo file
				9	# (whose name is chosen from the file or directory name of the argument).
				10	# For instance:
				11	# html2texi.pl api/index.html
				12	# produces file "api.texi".
				13
				14	# Texinfo format can be easily converted to Info format (for browsing in
				15	# Emacs or the standalone Info browser), to a printed manual, or to HTML.
				16	# Thus, html2texi.pl permits conversion of HTML files to Info format, and
				17	# secondarily enables producing printed versions of Web page hierarchies.
				18
				19	# Unlike HTML, Info format is searchable. Since Info is integrated into
				20	# Emacs, one can read documentation without starting a separate Web
				21	# browser. Additionally, Info browsers (including Emacs) contain
				22	# convenient features missing from Web browsers, such as easy index lookup
				23	# and mouse-free browsing.
				24
				25	# Limitations:
				26	# html2texi.pl is currently tuned to latex2html output, but should be
				27	# extensible to arbitrary HTML documents. It will be most useful for HTML
				28	# with a hierarchical structure and an index. The HTML tree to be
				29	# traversed must be on local disk, rather than being accessed via HTTP.
				30	# This script requires the use of "checkargs.pm". To eliminate that
				31	# dependence, replace calls to check_args* by @_ (which is always the last
				32	# argument to those functions).
				33	# Also see the "to do" section, below.
				34	# Comments, suggestions, bug fixes, and enhancements are welcome.
				35
				36	###
				37	### Typical usage for the Python documentation:
				38	###
				39
				40	# (Actually, most of this is in a Makefile instead.)
				41	# The resulting Info format Python documentation is currently available at
				42	# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
				43
				44	# Fix up HTML problems, eg <DL COMPACT><DD>
				45
				46	# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
				47	# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
				48	# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
				49	# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
				50	# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
				51	# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
				52
				53	# Edit the generated .texi files:
				54	# * change @setfilename to prefix "python-"
				55	# * fix up any sectioning, such as for Abstract
				56	# * make Texinfo menus
				57	# * perhaps remove the @detailmenu ... @end detailmenu
				58	# In Emacs:
				59	# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
				60
				61	# makeinfo api.texi
				62	# makeinfo ext.texi
				63	# makeinfo lib.texi
				64	# makeinfo mac.texi
				65	# makeinfo ref.texi
				66	# makeinfo tut.texi
				67
				68
				69	###
				70	### Structure of the code
				71	###
				72
				73	# To be written...
				74
				75
				76	###
				77	### Design decisions
				78	###
				79
				80	# Source and destination languages
				81	# --------------------------------
				82	#
				83	# The goal is Info files; I create Texinfo, so I don't have to worry about
				84	# the finer details of Info file creation. (I'm not even sure of its exact
				85	# format.)
				86	#
				87	# Why not start from LaTeX rather than HTML?
				88	# I could hack latex2html itself to produce Texinfo instead, or fix up
				89	# partparse.py (which already translates LaTeX to Texinfo).
				90	# Pros:
				91	# * has high-level information such as index entries, original formatting
				92	# Cons:
				93	# * those programs are complicated to read and understand
				94	# * those programs try to handle arbitrary LaTeX input, track catcodes,
				95	# and more: I don't want to go to that effort. HTML isn't as powerful
				96	# as LaTeX, so there are fewer subtleties.
				97	# * the result wouldn't work for arbitrary HTML documents; it would be
				98	# nice to eventually extend this program to HTML produced from Docbook,
				99	# Frame, and more.
				100
				101	# Parsing
				102	# -------
				103	#
				104	# I don't want to view the text as a linear stream; I'd rather parse the
				105	# whole thing and then do pattern matching over the parsed representation (to
				106	# find idioms such as indices, lists of child nodes, etc.).
				107	# * Perl provides HTML::TreeBuilder, which does just what I want.
				108	# * libwww-perl: http://www.linpro.no/lwp/
				109	# * TreeBuilder: HTML-Tree-0.51.tar.gz
				110	# * Python Parsers, Formatters, and Writers don't really provide the right
				111	# interface (and the version in Grail doesn't correspond to another
				112	# distributed version, so I'm confused about which to be using). I could
				113	# write something in Python that creates a parse tree, but why bother?
				114
				115	# Other implementation language issues:
				116	# * Python lacks variable declarations, reasonable scoping, and static
				117	# checking tools. I've written some of the latter for myself that make
				118	# my Perl programming a lot safer than my Python programming will be until
				119	# I have a similar suite for that language.
				120
				121
				122	###########################################################################
				123	### To do
				124	###
				125
				126	# Section names:
				127	# Fix the problem with multiple sections in a single file (eg, Abstract in
				128	# Front Matter section).
				129	# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
				130	# Index:
				131	# Perhaps double-check that every tag mentioned in the index is found
				132	# in the text.
				133	# Python: email to python-docs@python.org, to get their feedback.
				134	# Compare to existing lib/ Info manual
				135	# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
				136	# Postpass to remove extra quotation marks around typography already in
				137	# a different font (to avoid double delimiters as in "`code'"); or
				138	# perhaps consider using only font-based markup so that we don't get
				139	# the extra bold and `code' markup in Info.
				140
				141	## Perhaps don't rely on automatic means for adding up, next, prev; I have
				142	## all that info available to me already, so it's not so much trouble to
				143	## add it. (Right?) But it is so easy to use Emacs instead...
				144
				145
				146	###########################################################################
				147	### Strictures
				148	###
				149
				150	# man HTML::TreeBuilder
				151	# man HTML::Parser
				152	# man HTML::Element
				153
				154	# require HTML::ParserWComment;
				155	require HTML::Parser;
				156	require HTML::TreeBuilder;
				157	require HTML::Element;
				158
				159	use File::Basename;
				160	use Cwd;
				161
				162	use strict;
				163	# use Carp;
				164
				165
				166	use checkargs;
				167
				168
				169	###########################################################################
				170	### Variables
				171	###
				172
				173	my @section_stack = (); # elements are chapter/section/subsec nodetitles (I think)
				174	my $current_ref_tdf; # for the file currently being processed;
				175	# used in error messages
				176	my $html_directory;
				177	my %footnotes;
				178
				179	# First element should not be used.
				180	my @sectionmarker = ("manual", "chapter", "section", "subsection", "subsubsection");
				181
				182	my %inline_markup = ("b" => "strong",
				183	"code" => "code",
				184	"i" => "emph",
				185	"kbd" => "kbd",
				186	"samp" => "samp",
				187	"strong" => "strong",
				188	"tt" => "code",
				189	"var" => "var");
				190
				191	my @deferred_index_entries = ();
				192
				193	my @index_titles = (); # list of (filename, type) lists
				194	my %index_info = ("Index" => ["\@blindex", "bl"],
				195	"Concept Index" => ["\@cindex", "cp"],
				196	"Module Index" => ["\@mdindex", "md"]);
				197
				198
				199	###########################################################################
				200	### Main/contents page
				201	###
				202
				203	# Process first-level page on its own, or just a contents page? Well, I do
				204	# want the title, author, etc., and the front matter... For now, just add
				205	# that by hand at the end.
				206
				207
				208	# data structure possibilities:
				209	# * tree-like (need some kind of stack when processing (or parent pointers))
				210	# * list of name and depth; remember old and new depths.
				211
				212	# Each element is a reference to a list of (nodetitle, depth, filename).
				213	my @contents_list = ();
				214
				215	# The problem with doing fixups on the fly is that some sections may have
				216	# already been processed (and no longer available) by the time we notice
				217	# others with the same name. It's probably better to fully construct the
				218	# contents list (reading in all files of interest) upfront; that will also
				219	# let me do a better job with cross-references, because again, all files
				220	# will already be read in.
				221	my %contents_hash = ();
				222	my %contents_fixups = ();
				223
				224	my @current_contents_list = ();
				225
				226	# Merge @current_contents_list into @contents_list,
				227	# and set @current_contents_list to be empty.
				228	sub merge_contents_lists ( )
				229	{ check_args(0, @_);
				230
				231	# Three possibilities:
				232	# * @contents_list is empty: replace it by @current_contents_list.
				233	# * prefixes of the two lists are identical: do nothing
				234	# * @current_contents_list is all at lower level than $contents_list[0];
				235	# prefix @contents_list by @current_contents_list
				236
				237	if (scalar(@current_contents_list) == 0)
				238	{ die "empty current_contents_list"; }
				239
				240	# if (scalar(@contents_list) == 0)
				241	# { @contents_list = @current_contents_list;
				242	# @current_contents_list = ();
				243	# return; }
				244
				245	# if (($ {$contents_list[0]}[1]) < ($ {$current_contents_list[0]}[1]))
				246	# { unshift @contents_list, @current_contents_list;
				247	# @current_contents_list = ();
				248	# return; }
				249
				250	for (my $i=0; $i<scalar(@current_contents_list); $i++)
				251	{ my $ref_c_tdf = $current_contents_list[$i];
				252	if ($i >= scalar(@contents_list))
				253	{ push @contents_list, $ref_c_tdf;
				254	my $title = $ {$ref_c_tdf}[0];
				255	if (defined $contents_hash{$title})
				256	{ $contents_fixups{$title} = 1; }
				257	else
				258	{ $contents_hash{$title} = 1; }
				259	next; }
				260	my $ref_tdf = $contents_list[$i];
				261	my ($title, $depth, $file) = @{$ref_tdf};
				262	my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
				263
				264	if (($title ne $c_title)
				265	&& ($depth < $c_depth)
				266	&& ($file ne $c_file))
				267	{ splice @contents_list, $i, 0, $ref_c_tdf;
				268	if (defined $contents_hash{$c_title})
				269	{ $contents_fixups{$c_title} = 1; }
				270	else
				271	{ $contents_hash{$c_title} = 1; }
				272	next; }
				273
				274	if (($title ne $c_title)
				275	\|\| ($depth != $c_depth)
				276	\|\| ($file ne $c_file))
				277	{ die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
				278	"\n main: <<<$title>>> $depth $file",
				279	"\n curr: <<<$c_title>>> $c_depth $c_file"); }
				280	}
				281	@current_contents_list = ();
				282	}
				283
				284
				285
				286	# Set @current_contents_list to a list of (title, href, sectionlevel);
				287	# then merge that list into @contents_list.
				288	# Maybe this function should also produce a map
				289	# from title (or href) to sectionlevel (eg "chapter"?).
				290	sub process_child_links ( $ )
				291	{ my ($he) = check_args(1, @_);
				292
				293	# $he->dump;
				294	if (scalar(@current_contents_list) != 0)
				295	{ die "current_contents_list nonempty: @current_contents_list"; }
				296	$he->traverse(\&increment_current_contents_list, 'ignore text');
				297
				298	# Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
				299	my %depths = ();
				300	for my $ref_tdf (@current_contents_list)
				301	{ $depths{$ {$ref_tdf}[1]} = 1; }
				302	my @sorted_depths = sort keys %depths;
				303	my $current_depth = scalar(@section_stack)-1;
				304	my $current_depth_2 = $ {$current_ref_tdf}[1];
				305	if ($current_depth != $current_depth_2)
				306	{ die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
				307	for (my $i=0; $i<scalar(@sorted_depths); $i++)
				308	{ $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
				309	for my $ref_tdf (@current_contents_list)
				310	{ $ {$ref_tdf}[1] = $depths{$ {$ref_tdf}[1]}; }
				311
				312	# Eliminate uninteresting sections. Hard-coded hack for now.
				313	if ($ {$current_contents_list[-1]}[0] eq "About this document ...")
				314	{ pop @current_contents_list; }
				315	if ((scalar(@current_contents_list) > 1)
				316	&& ($ {$current_contents_list[1]}[0] eq "Contents"))
				317	{ my $ref_first_tdf = shift @current_contents_list;
				318	$current_contents_list[0] = $ref_first_tdf; }
				319
				320	for (my $i=0; $i<scalar(@current_contents_list); $i++)
				321	{ my $ref_tdf = $current_contents_list[$i];
				322	my $title = $ {$ref_tdf}[0];
				323	if (exists $index_info{$title})
				324	{ my $index_file = $ {$ref_tdf}[2];
				325	my ($indexing_command, $suffix) = @{$index_info{$title}};
				326	process_index_file($index_file, $indexing_command);
				327	print TEXI "\n\@defindex $suffix\n";
				328	push @index_titles, $title;
				329	splice @current_contents_list, $i, 1;
				330	$i--; }
				331	elsif ($title =~ /\bIndex$/)
				332	{ print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }
				333
				334	merge_contents_lists();
				335
				336	# print_contents_list();
				337	# print_index_info();
				338	}
				339
				340
				341	sub increment_current_contents_list ( $$$ )
				342	{ my ($he, $startflag, $depth) = check_args(3, @_);
				343	if (!$startflag)
				344	{ return; }
				345
				346	if ($he->tag eq "li")
				347	{ my @li_content = @{$he->content};
				348	if ($li_content[0]->tag ne "a")
				349	{ die "first element of <LI> should be <A>"; }
				350	my ($name, $href, @content) = anchor_info($li_content[0]);
				351	# unused $name
				352	my $title = join("", collect_texts($li_content[0]));
				353	$title = texi_remove_punctuation($title);
				354	# The problem with these is that they are formatted differently in
				355	# @menu and @node!
				356	$title =~ s/``/\"/g;
				357	$title =~ s/''/\"/g;
				358	$title =~ s/ -- / /g;
				359	push @current_contents_list, [ $title, $depth, $href ]; }
				360	return 1;
				361	}
				362
				363	# Simple version for section titles
				364	sub html_to_texi ( $ )
				365	{ my ($he) = check_args(1, @_);
				366	if (!ref $he)
				367	{ return $he; }
				368
				369	my $tag = $he->tag;
				370	if (exists $inline_markup{$tag})
				371	{ my $result = "\@$inline_markup{$tag}\{";
				372	for my $elt (@{$he->content})
				373	{ $result .= html_to_texi($elt); }
				374	$result .= "\}";
				375	return $result; }
				376	else
				377	{ $he->dump;
				378	die "html_to_texi confused by <$tag>"; }
				379	}
				380
				381
				382
				383	sub print_contents_list ()
				384	{ check_args(0, @_);
				385	print STDERR "Contents list:\n";
				386	for my $ref_tdf (@contents_list)
				387	{ my ($title, $depth, $file) = @{$ref_tdf};
				388	print STDERR "$title $depth $file\n"; }
				389	}
				390
				391
				392
				393	###########################################################################
				394	### Index
				395	###
				396
				397	my $l2h_broken_link_name = "l2h-";
				398
				399
				400	# map from file to (map from anchor name to (list of index texts))
				401	# (The list is needed when a single LaTeX command like \envvar
				402	# expands to multiple \index commands.)
				403	my %file_index_entries = ();
				404	my %this_index_entries; # map from anchor name to (list of index texts)
				405
				406	my %file_index_entries_broken = (); # map from file to (list of index texts)
				407	my @this_index_entries_broken;
				408
				409	my $index_prefix = "";
				410	my @index_prefixes = ();
				411
				412	my $this_indexing_command;
				413
				414	sub print_index_info ()
				415	{ check_args(0, @_);
				416	my ($key, $val);
				417	for my $file (sort keys %file_index_entries)
				418	{ my %index_entries = %{$file_index_entries{$file}};
				419	print STDERR "file: $file\n";
				420	for my $aname (sort keys %index_entries)
				421	{ my @entries = @{$index_entries{$aname}};
				422	if (scalar(@entries) == 1)
				423	{ print STDERR " $aname : $entries[0]\n"; }
				424	else
				425	{ print STDERR " $aname : ", join("\n " . (" " x length($aname)), @entries), "\n"; } } }
				426	for my $file (sort keys %file_index_entries_broken)
				427	{ my @entries = @{$file_index_entries_broken{$file}};
				428	print STDERR "file: $file\n";
				429	for my $entry (@entries)
				430	{ print STDERR " $entry\n"; }
				431	}
				432	}
				433
				434
				435	sub process_index_file ( $$ )
				436	{ my ($file, $indexing_command) = check_args(2, @_);
				437	# print "process_index_file $file $indexing_command\n";
				438
				439	my $he = file_to_tree($html_directory . $file);
				440	# $he->dump();
				441
				442	$this_indexing_command = $indexing_command;
				443	$he->traverse(\&process_if_index_dl_compact, 'ignore text');
				444	undef $this_indexing_command;
				445	# print "process_index_file done\n";
				446	}
				447
				448
				449	sub process_if_index_dl_compact ( $$$ )
				450	{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
				451	if (!$startflag)
				452	{ return; }
				453
				454	if (($he->tag() eq "dl") && (defined $he->attr('compact')))
				455	{ process_index_dl_compact($he);
				456	return 0; }
				457	else
				458	{ return 1; }
				459	}
				460
				461
				462	# The elements of a <DL COMPACT> list from a LaTeX2HTML index:
				463	# * a single space: text to be ignored
				464	# * <DT> elements with an optional <DD> element following each one
				465	# Two types of <DT> elements:
				466	# * Followed by a <DD> element: the <DT> contains a single
				467	# string, and the <DD> contains a whitespace string to be ignored, a
				468	# <DL COMPACT> to be recursively processed (with the <DT> string as a
				469	# prefix), and a whitespace string to be ignored.
				470	# * Not followed by a <DD> element: contains a list of anchors
				471	# and texts (ignore the texts, which are only whitespace and commas).
				472	# Optionally contains a <DL COMPACT> to be recursively processed (with
				473	# the <DT> string as a prefix)
				474	sub process_index_dl_compact ( $ )
				475	{ my ($h) = check_args(1, @_);
				476	my @content = @{$h->content()};
				477	for (my $i = 0; $i < scalar(@content); $i++)
				478	{ my $this_he = $content[$i];
				479	if ($this_he->tag ne "dt")
				480	{ $this_he->dump;
				481	die "Expected <DT> tag: " . $this_he->tag; }
				482	if (($i < scalar(@content) - 1) && ($content[$i+1]->tag eq "dd"))
				483	{ process_index_dt_and_dd($this_he, $content[$i+1]);
				484	$i++; }
				485	else
				486	{ process_index_lone_dt($this_he); } } }
				487
				488
				489
				490	# Argument is a <DT> element. If it contains more than one anchor, then
				491	# the texts of all subsequent ones are "[Link]". Example:
				492	# <DT>
				493	# <A HREF="embedding.html#l2h-201">
				494	# "$PATH"
				495	# ", "
				496	# <A HREF="embedding.html#l2h-205">
				497	# "[Link]"
				498	# Optionally contains a <DL COMPACT> as well. Example:
				499	# <DT>
				500	# <A HREF="types.html#l2h-616">
				501	# "attribute"
				502	# <DL COMPACT>
				503	# <DT>
				504	# <A HREF="assignment.html#l2h-3074">
				505	# "assignment"
				506	# ", "
				507	# <A HREF="assignment.html#l2h-3099">
				508	# "[Link]"
				509	# <DT>
				510	# <A HREF="types.html#l2h-">
				511	# "assignment, class"
				512
				513	sub process_index_lone_dt ( $ )
				514	{ my ($dt) = check_args(1, @_);
				515	my @dtcontent = @{$dt->content()};
				516	my $acontent;
				517	my $acontent_suffix;
				518	for my $a (@dtcontent)
				519	{ if ($a eq ", ")
				520	{ next; }
				521	if (!ref $a)
				522	{ $dt->dump;
				523	die "Unexpected <DT> string element: $a"; }
				524
				525	if ($a->tag eq "dl")
				526	{ push @index_prefixes, $index_prefix;
				527	if (!defined $acontent_suffix)
				528	{ die "acontent_suffix not yet defined"; }
				529	$index_prefix .= $acontent_suffix . ", ";
				530	process_index_dl_compact($a);
				531	$index_prefix = pop(@index_prefixes);
				532	return; }
				533
				534	if ($a->tag ne "a")
				535	{ $dt->dump;
				536	$a->dump;
				537	die "Expected anchor in lone <DT>"; }
				538
				539	my ($aname, $ahref, @acontent) = anchor_info($a);
				540	# unused $aname
				541	if (scalar(@acontent) != 1)
				542	{ die "Expected just one content of <A> in <DT>: @acontent"; }
				543	if (ref $acontent[0])
				544	{ $acontent[0]->dump;
				545	die "Expected string content of <A> in <DT>: $acontent[0]"; }
				546	if (!defined($acontent))
				547	{ $acontent = $index_prefix . $acontent[0];
				548	$acontent_suffix = $acontent[0]; }
				549	elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
				550	{ die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }
				551
				552	if (!defined $ahref)
				553	{ $dt->dump;
				554	die "no HREF in nachor in <DT>"; }
				555	my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
				556	if (!defined $ahref_name)
				557	{ # Reference to entire file
				558	$ahref_name = ""; }
				559
				560	if ($ahref_name eq $l2h_broken_link_name)
				561	{ if (!exists $file_index_entries_broken{$ahref_file})
				562	{ $file_index_entries_broken{$ahref_file} = []; }
				563	push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
				564	next; }
				565
				566	if (!exists $file_index_entries{$ahref_file})
				567	{ $file_index_entries{$ahref_file} = {}; }
				568	# Don't do this! It appears to make a copy, which is not desired.
				569	# my %index_entries = %{$file_index_entries{$ahref_file}};
				570	if (!exists $ {$file_index_entries{$ahref_file}}{$ahref_name})
				571	{ $ {$file_index_entries{$ahref_file}}{$ahref_name} = []; }
				572	# { my $oldcontent = $ {$file_index_entries{$ahref_file}}{$ahref_name};
				573	# if ($acontent eq $oldcontent)
				574	# { die "Multiple identical index entries?"; }
				575	# die "Trying to add $acontent, but already have index entry pointing at $ahref_file\#$ahref_name: ${$file_index_entries{$ahref_file}}{$ahref_name}"; }
				576
				577	push @{$ {$file_index_entries{$ahref_file}}{$ahref_name}}, "$this_indexing_command $acontent";
				578	# print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
				579	}
				580	}
				581
				582	sub process_index_dt_and_dd ( $$ )
				583	{ my ($dt, $dd) = check_args(2, @_);
				584	my $dtcontent;
				585	{ my @dtcontent = @{$dt->content()};
				586	if ((scalar(@dtcontent) != 1) \|\| (ref $dtcontent[0]))
				587	{ $dd->dump;
				588	$dt->dump;
				589	die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent"; }
				590	$dtcontent = $dtcontent[0];
				591	$dtcontent =~ s/ +$//; }
				592	my $ddcontent;
				593	{ my @ddcontent = @{$dd->content()};
				594	if (scalar(@ddcontent) != 1)
				595	{ die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; }
				596	$ddcontent = $ddcontent[0]; }
				597	if ($ddcontent->tag ne "dl")
				598	{ die "Expected <DL> as content of <DD>, but saw: $ddcontent"; }
				599
				600	push @index_prefixes, $index_prefix;
				601	$index_prefix .= $dtcontent . ", ";
				602	process_index_dl_compact($ddcontent);
				603	$index_prefix = pop(@index_prefixes);
				604	}
				605
				606
				607	###########################################################################
				608	### Ordinary sections
				609	###
				610
				611	sub process_section_file ( $$$ )
				612	{ my ($file, $depth, $nodetitle) = check_args(3, @_);
				613	my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file);
				614
				615	# print STDERR "process_section_file: $file $depth $nodetitle\n";
				616
				617	# Equivalently:
				618	# while ($depth >= scalar(@section_stack)) { pop(@section_stack); }
				619	@section_stack = @section_stack[0..$depth-1];
				620
				621	# Not a great nodename fixup scheme; need a more global view
				622	if ((defined $contents_fixups{$nodetitle})
				623	&& (scalar(@section_stack) > 0))
				624	{ my $up_title = $section_stack[$#section_stack];
				625	# hack for Python Standard Library
				626	$up_title =~ s/^(Built-in\|Standard) Module //g;
				627	my ($up_first_word) = split(/ /, $up_title);
				628	$nodetitle = "$up_first_word $nodetitle";
				629	}
				630
				631	push @section_stack, $nodetitle;
				632	# print STDERR "new section_stack: ", join(", ", @section_stack), "\n";
				633
				634	$he->traverse(\&process_if_child_links, 'ignore text');
				635	%footnotes = ();
				636	# $he->dump;
				637	$he->traverse(\&process_if_footnotes, 'ignore text');
				638
				639	# $he->dump;
				640
				641	if (exists $file_index_entries{$file})
				642	{ %this_index_entries = %{$file_index_entries{$file}};
				643	# print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
				644	}
				645	else
				646	{ # print STDERR "Warning: no index entries for file $file\n";
				647	%this_index_entries = (); }
				648
				649	if (exists $file_index_entries_broken{$file})
				650	{ @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
				651	else
				652	{ # print STDERR "Warning: no index entries for file $file\n";
				653	@this_index_entries_broken = (); }
				654
				655
				656	if ($he->tag() ne "html")
				657	{ die "Expected <HTML> at top level"; }
				658	my @content = @{$he->content()};
				659	if ((!ref $content[0]) or ($content[0]->tag ne "head"))
				660	{ $he->dump;
				661	die "<HEAD> not first element of <HTML>"; }
				662	if ((!ref $content[1]) or ($content[1]->tag ne "body"))
				663	{ $he->dump;
				664	die "<BODY> not second element of <HTML>"; }
				665
				666	$content[1]->traverse(\&output_body);
				667	}
				668
				669	# stack of things we're inside that are preventing indexing from occurring now.
				670	# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
				671	my @index_deferrers = ();
				672
				673	sub push_or_pop_index_deferrers ( $$ )
				674	{ my ($tag, $startflag) = check_args(2, @_);
				675	if ($startflag)
				676	{ push @index_deferrers, $tag; }
				677	else
				678	{ my $old_deferrer = pop @index_deferrers;
				679	if ($tag ne $old_deferrer)
				680	{ die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers); }
				681	do_deferred_index_entries(); }
				682	}
				683
				684
				685	sub label_add_index_entries ( $;$ )
				686	{ my ($label, $he) = check_args_range(1, 2, @_);
				687	# print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";
				688	# $he is the anchor element
				689	if (exists $this_index_entries{$label})
				690	{ push @deferred_index_entries, @{$this_index_entries{$label}};
				691	return; }
				692
				693	if ($label eq $l2h_broken_link_name)
				694	{ # Try to find some text to use in guessing which links should point here
				695	# I should probably only look at the previous element, or if that is
				696	# all punctuation, the one before it; collecting all the previous texts
				697	# is a bit of overkill.
				698	my @anchor_texts = collect_texts($he);
				699	my @previous_texts = collect_texts($he->parent, $he);
				700	# 4 elements is arbitrary; ought to filter out punctuation and small words
				701	# first, then perhaps keep fewer. Perhaps also filter out formatting so
				702	# that we can see a larger chunk of text? (Probably not.)
				703	# Also perhaps should do further chunking into words, in case the
				704	# index term isn't a chunk of its own (eg, was in <tt>...</tt>.
				705	my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);
				706
				707	my $guessed = 0;
				708	for my $text (@candidate_texts)
				709	{ # my $orig_text = $text;
				710	if ($text =~ /^[\"\`\'().?! ]*$/)
				711	{ next; }
				712	if (length($text) <= 2)
				713	{ next; }
				714	# hack for Python manual; maybe defer until failure first time around?
				715	$text =~ s/^sys\.//g;
				716	for my $iterm (@this_index_entries_broken)
				717	{ # I could test for zero: LaTeX2HTML's failures in the Python
				718	# documentation are only for items of the form "... (built-in...)"
				719	if (index($iterm, $text) != -1)
				720	{ push @deferred_index_entries, $iterm;
				721	# print STDERR "Guessing index term `$iterm' for text `$orig_text'\n";
				722	$guessed = 1;
				723	} } }
				724	if (!$guessed)
				725	{ # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
				726	}
				727	}
				728	}
				729
				730
				731	# Need to add calls to this at various places.
				732	# Perhaps add HTML::Element argument and do the check for appropriateness
				733	# here (ie, no action if inside <H1>, etc.).
				734	sub do_deferred_index_entries ()
				735	{ check_args(0, @_);
				736	if ((scalar(@deferred_index_entries) > 0)
				737	&& (scalar(@index_deferrers) == 0))
				738	{ print TEXI "\n", join("\n", @deferred_index_entries), "\n";
				739	@deferred_index_entries = (); }
				740	}
				741
				# State shared by the <table>, <tr>, <td> and <th> branches of output_body.
				my $table_columns;          # undefined if not in a table
				my $table_first_column;     # boolean

				# Traversal callback that writes the Texinfo rendering of one node to TEXI.
				#
				# Arguments: ($he, $startflag, $depth).  $he is either a plain string (a text
				# node) or an element object; $startflag is true when the element is being
				# entered and false when it is being left; $depth is ignored.
				#
				# Returns 1 for elements whose children should be traversed normally, and 0
				# where the element's content has already been emitted here or is to be
				# skipped (headings, <pre>, footnotes, <address>, "ignore", unknown tags).
				# Text nodes return nothing.
				# NOTE(review): the meaning of the return value is defined by
				# HTML::Element::traverse -- confirm against the installed version.
				sub output_body ( $$$ )
				{
				    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument

				    # Text node: emit it quoted, flushing deferred index entries after the
				    # first space (if there is one).
				    if (!ref $he) {
				        my $space_index = index($he, " ");
				        if ($space_index != -1) {
				            # Why does
				            #   print TEXI texi_quote(substr($he, 0, $space_index+1));
				            # give:  Can't locate object method "TEXI" via package "texi_quote"
				            # (Because the definition texi_quote hasn't been seen yet.)
				            print TEXI &texi_quote(substr($he, 0, $space_index+1));
				            do_deferred_index_entries();
				            print TEXI &texi_quote(substr($he, $space_index+1));
				        }
				        else {
				            print TEXI &texi_quote($he);
				        }
				        return;
				    }

				    my $tag = $he->tag();

				    # Ordinary text markup first
				    if (exists $inline_markup{$tag}) {
				        if ($startflag) {
				            print TEXI "\@$inline_markup{$tag}\{";
				        }
				        else {
				            print TEXI "\}";
				        }
				    }
				    elsif ($tag eq "a") {
				        my ($name, $href) = anchor_info($he);
				        if (!$href) {
				            # This anchor is only here for indexing/cross referencing purposes.
				            if ($startflag) {
				                label_add_index_entries($name, $he);
				            }
				        }
				        elsif ($href =~ /^(ftp|http|news):/) {
				            if ($startflag) {
				                # Should avoid second argument if it's identical to the URL.
				                print TEXI "\@uref\{$href, ";
				            }
				            else {
				                print TEXI "\}";
				            }
				        }
				        elsif ($href =~ /^\#(foot[0-9]+)$/) {
				            # Footnote
				            if ($startflag) {
				                my $label = $1;
				                my $footnote = $footnotes{$label};
				                # Fail with a readable message instead of a method call on
				                # an undefined value.
				                if (!defined $footnote) {
				                    $he->dump;
				                    die "No footnote recorded for anchor `$label'";
				                }
				                # Could double-check name and content, but I'm not
				                # currently storing that information.
				                print TEXI "\@footnote\{";
				                $footnote->traverse(\&output_body);
				                print TEXI "\}";
				                return 0;
				            }
				        }
				        else {
				            if ($startflag) {
				                $he->dump;
				                warn "Can't deal with internal HREF anchors yet";
				            }
				        }
				    }
				    elsif ($tag eq "address") {
				        # this is part of the page footer, ignore
				        return 0;
				    }
				    elsif ($tag eq "br") {
				        print TEXI "\@\n";
				    }
				    elsif ($tag eq "body") {
				    }
				    elsif ($tag eq "center") {
				        # A <center> holding only spaces produces no output at all.
				        if (has_single_content_string($he)
				            && ($he->content->[0] =~ /^ *$/)) {
				            return 0;
				        }
				        if ($startflag) {
				            print TEXI "\n\@center\n";
				        }
				        else {
				            print TEXI "\n\@end center\n";
				        }
				    }
				    elsif ($tag eq "div") {
				        my $align = $he->attr('align');
				        if (defined($align) && ($align eq "center")) {
				            if (has_single_content_string($he)
				                && ($he->content->[0] =~ /^ *$/)) {
				                return 0;
				            }
				            if ($startflag) {
				                print TEXI "\n\@center\n";
				            }
				            else {
				                print TEXI "\n\@end center\n";
				            }
				        }
				    }
				    elsif ($tag eq "dl") {
				        # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
				        if (has_single_content_with_tag($he, "dd")) {
				            my $he_dd = $he->content->[0];
				            if (has_single_content_with_tag($he_dd, "pre")) {
				                my $he_pre = $he_dd->content->[0];
				                print_pre($he_pre);
				                return 0;
				            }
				        }
				        if ($startflag) {
				            # Could examine the elements, to be cleverer about formatting.
				            # (Also to use ftable, vtable...)
				            print TEXI "\n\@table \@asis\n";
				        }
				        else {
				            print TEXI "\n\@end table\n";
				        }
				    }
				    elsif ($tag eq "dt") {
				        push_or_pop_index_deferrers($tag, $startflag);
				        if ($startflag) {
				            print TEXI "\n\@item ";
				        }
				    }
				    elsif ($tag eq "dd") {
				        if ($startflag) {
				            print TEXI "\n";
				        }
				        if (scalar(@index_deferrers) != 0) {
				            $he->dump;
				            die "index deferrers: ", join(" ", @index_deferrers);
				        }
				        do_deferred_index_entries();
				    }
				    elsif ($tag =~ /^(font|big|small)$/) {
				        # Do nothing for now.
				    }
				    elsif ($tag =~ /^h[1-6]$/) {
				        # We don't need this because we never recursively enter the heading content.
				        # push_or_pop_index_deferrers($tag, $startflag);
				        my $secname = "";
				        my @seclabels = ();
				        for my $elt (@{$he->content}) {
				            if (!ref $elt) {
				                $secname .= $elt;
				            }
				            elsif ($elt->tag eq "br") {
				            }
				            elsif ($elt->tag eq "a") {
				                my ($name, $href, @acontent) = anchor_info($elt);
				                if ($href) {
				                    $he->dump;
				                    $elt->dump;
				                    die "Nonsimple anchor in <$tag>";
				                }
				                if (!defined $name) {
				                    die "No NAME for anchor in $tag";
				                }
				                push @seclabels, $name;
				                for my $subelt (@acontent) {
				                    $secname .= html_to_texi($subelt);
				                }
				            }
				            else {
				                $secname .= html_to_texi($elt);
				            }
				        }
				        if ($secname eq "") {
				            die "No section name in <$tag>";
				        }
				        if (scalar(@section_stack) == 1) {
				            if ($section_stack[-1] ne "Top") {
				                die "Not top? $section_stack[-1]";
				            }
				            print TEXI "\@settitle $secname\n";
				            print TEXI "\@c %**end of header\n";
				            print TEXI "\n";
				            print TEXI "\@node Top\n";
				            print TEXI "\n";
				        }
				        else {
				            print TEXI "\n\@node $section_stack[-1]\n";
				            print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n";
				        }
				        for my $seclabel (@seclabels) {
				            label_add_index_entries($seclabel);
				        }
				        # This should only happen once per file.
				        label_add_index_entries("");
				        if (scalar(@index_deferrers) != 0) {
				            die "index deferrers: ", join(" ", @index_deferrers);
				        }
				        do_deferred_index_entries();
				        return 0;
				    }
				    elsif ($tag eq "hr") {
				    }
				    elsif ($tag eq "ignore") {
				        # Hack for ignored elements
				        return 0;
				    }
				    elsif ($tag eq "li") {
				        if ($startflag) {
				            print TEXI "\n\n\@item\n";
				            do_deferred_index_entries();
				        }
				    }
				    elsif ($tag eq "ol") {
				        if ($startflag) {
				            # Texinfo's @enumerate takes an optional starting number or
				            # letter; @bullet is only meaningful as an @itemize argument.
				            print TEXI "\n\@enumerate\n";
				        }
				        else {
				            print TEXI "\n\@end enumerate\n";
				        }
				    }
				    elsif ($tag eq "p") {
				        if ($startflag) {
				            print TEXI "\n\n";
				        }
				        if (scalar(@index_deferrers) != 0) {
				            die "index deferrers: ", join(" ", @index_deferrers);
				        }
				        do_deferred_index_entries();
				    }
				    elsif ($tag eq "pre") {
				        print_pre($he);
				        return 0;
				    }
				    elsif ($tag eq "table") {
				        # Could also indicate common formatting for first column, or
				        # determine relative widths for columns (or determine a prototype row)
				        if ($startflag) {
				            if (defined $table_columns) {
				                $he->dump;
				                die "Can't deal with table nested inside $table_columns-column table";
				            }
				            $table_columns = table_columns($he);
				            if ($table_columns < 2) {
				                $he->dump;
				                die "Column with $table_columns columns?";
				            }
				            elsif ($table_columns == 2) {
				                print TEXI "\n\@table \@asis\n";
				            }
				            else {
				                # Give every column the same fraction of the line width.
				                print TEXI "\n\@multitable \@columnfractions";
				                for (my $i=0; $i<$table_columns; $i++) {
				                    print TEXI " ", 1.0/$table_columns;
				                }
				                print TEXI "\n";
				            }
				        }
				        else {
				            if ($table_columns == 2) {
				                print TEXI "\n\@end table\n";
				            }
				            else {
				                print TEXI "\n\@end multitable\n";
				            }
				            undef $table_columns;
				        }
				    }
				    elsif (($tag eq "td") || ($tag eq "th")) {
				        if ($startflag) {
				            # First cell of a row starts an @item; later cells get @tab,
				            # but only in a multitable (more than two columns).
				            if ($table_first_column) {
				                print TEXI "\n\@item ";
				                $table_first_column = 0;
				            }
				            elsif ($table_columns > 2) {
				                print TEXI "\n\@tab ";
				            }
				        }
				        else {
				            print TEXI "\n";
				        }
				    }
				    elsif ($tag eq "tr") {
				        if ($startflag) {
				            $table_first_column = 1;
				        }
				    }
				    elsif ($tag eq "ul") {
				        if ($startflag) {
				            print TEXI "\n\@itemize \@bullet\n";
				        }
				        else {
				            print TEXI "\n\@end itemize\n";
				        }
				    }
				    else {
				        print STDERR "\nBailing out\n";
				        $he->dump;
				        return 0;
				    }

				    return 1;
				}
				978
				# Write the text of a <PRE> element to TEXI between "@example" and
				# "@end example".  Dies unless the element's content is exactly one string.
				sub print_pre ( $ )
				{
				    my ($he_pre) = check_args(1, @_);
				    unless (has_single_content_string($he_pre)) {
				        die "Multiple or non-string content for <PRE>: ", @{$he_pre->content};
				    }
				    my $example_text = $he_pre->content->[0];
				    print TEXI "\n\@example";
				    print TEXI &texi_quote($example_text);
				    print TEXI "\@end example\n";
				}
				988
				# Return the largest number of children found in any <TR> row of $table
				# (0 for a table without rows).
				# Dies, after dumping the table, if a child of the table is not a <TR>
				# element.  Undefined content (table or row) is treated as empty.
				sub table_columns ( $ )
				{
				    my ($table) = check_args(1, @_);
				    my $result = 0;
				    for my $row (@{$table->content || []}) {
				        # A text child has no ->tag method; report it the same way as a
				        # wrong element instead of failing on the method call.
				        if (!ref($row) || ($row->tag ne "tr")) {
				            $table->dump;
				            $row->dump if ref $row;
				            die "Expected <TR> as table row.";
				        }
				        $result = max($result, scalar(@{$row->content || []}));
				    }
				    return $result;
				}
				1000
				1001
				1002	###########################################################################
				1003	### Utilities
				1004	###
				1005
				# Return the numerically smaller of two values (the second one on a tie).
				sub min ( $$ )
				{
				    my ($first, $second) = check_args(2, @_);
				    if ($first < $second) {
				        return $first;
				    }
				    return $second;
				}
				1010
				# Return the numerically larger of two values (the second one on a tie).
				sub max ( $$ )
				{
				    my ($first, $second) = check_args(2, @_);
				    if ($first > $second) {
				        return $first;
				    }
				    return $second;
				}
				1015
				# Parse the HTML file $file into an HTML::TreeBuilder tree, run
				# cleanup_parse_tree over it, and return the tree.
				# Dies if the file cannot be parsed (e.g. it cannot be opened).
				sub file_to_tree ( $ )
				{
				    my ($file) = check_args(1, @_);

				    my $tree = HTML::TreeBuilder->new;
				    $tree->ignore_unknown(1);
				    # $tree->warn(1);
				    # parse_file returns an undefined value when the file can't be opened;
				    # without this check an empty tree would be processed silently.
				    defined($tree->parse_file($file))
				        or die "Can't parse HTML file `$file': $!";
				    cleanup_parse_tree($tree);
				    return $tree;
				}
				1026
				1027
				# Return 1 if element $he has exactly one child (of any kind), else 0.
				# Dies if $he is not a reference.
				sub has_single_content ( $ )
				{
				    my ($he) = check_args(1, @_);
				    unless (ref $he) {
				        # return 0;
				        die "Non-reference argument: $he";
				    }
				    my $children = $he->content;
				    return 0 unless defined $children;
				    return (scalar(@{$children}) == 1) ? 1 : 0;
				}
				1041
				1042
				# True when $he has exactly one child and that child is an element whose
				# tag equals $tag.  A single text child, or a child without a tag, gives 0.
				sub has_single_content_with_tag ( $$ )
				{
				    my ($he, $tag) = check_args(2, @_);
				    return 0 unless has_single_content($he);
				    my $only_child = $he->content->[0];
				    return 0 unless ref $only_child;
				    my $child_tag = $only_child->tag;
				    return 0 unless defined $child_tag;
				    return $child_tag eq $tag;
				}
				1057
				# Return 1 if $he has exactly one child and that child is a plain string
				# (not a reference); otherwise 0.
				sub has_single_content_string ( $ )
				{
				    my ($he) = check_args(1, @_);
				    return 0 unless has_single_content($he);
				    my $only_child = $he->content->[0];
				    return ref($only_child) ? 0 : 1;
				}
				1067
				1068
				# Return (name, href, content...) for the anchor element $he.  The NAME and
				# HREF attribute values may be undefined; the remaining return values are
				# the anchor's children (none if it has no content).
				# Dies, after dumping the element, if $he is not an <A> element.
				# I don't see how to determine if there are more attributes.
				sub anchor_info ( $ )
				{
				    my ($he) = check_args(1, @_);
				    unless ($he->tag eq "a") {
				        $he->dump;
				        die "passed non-anchor to anchor_info";
				    }
				    my $name = $he->attr('name');
				    my $href = $he->attr('href');
				    my $children = $he->content;
				    my @content = defined($children) ? @{$children} : ();
				    return ($name, $href, @content);
				}
				1084
				1085
				# Return a copy of $text with each "@", "{" and "}" prefixed by "@", and
				# each " -- " replaced by " --- ".
				sub texi_quote ( $ )
				{
				    my ($quoted) = check_args(1, @_);
				    for ($quoted) {
				        s/([\@\{\}])/\@$1/g;
				        s/ -- / --- /g;
				    }
				    return $quoted;
				}
				1092
				# Clean up a section title so that it does not confuse Makeinfo or Info.
				# In order: strip leading spaces, strip trailing spaces and colons, strip a
				# leading section number ("1.2 "), then delete all commas and colons.
				sub texi_remove_punctuation ( $ )
				{
				    my ($title) = check_args(1, @_);

				    for ($title) {
				        s/^ +//g;
				        s/[ :]+$//g;
				        s/^[1-9][0-9.]* +//g;
				        s/,//g;
				        # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
				        # gets converted into " - ", just as "---" would be converted into
				        # " -- ", so the names end up differing.)
				        # s/:/ -- /g;
				        s/://g;
				    }
				    return $title;
				}
				1108
				1109
				## Do not use this inside `traverse': it throws off the traversal.  Use
				## html_replace_by_ignore or html_replace_by_meta instead.
				# Remove $he from the content of $parent (default: $he's own parent) and
				# clear $he's parent link.  Returns 1 on success; dies if $he is not found
				# among the parent's children.
				sub html_remove ( $;$ )
				{
				    my ($he, $parent) = check_args_range(1, 2, @_);
				    $parent = $he->parent unless defined $parent;
				    my $siblings = $parent->content;
				    for my $pos (0 .. scalar(@{$siblings}) - 1) {
				        next unless $siblings->[$pos] eq $he;
				        splice @{$siblings}, $pos, 1;
				        $he->parent(undef);
				        return 1;
				    }
				    die "Didn't find $he in $parent";
				}
				1126
				1127
				# Put $new in the place of $orig within the content of $parent (default:
				# $orig's own parent), updating both elements' parent links.
				# Returns 1 on success; dies if $orig is not among the parent's children.
				sub html_replace ( $$;$ )
				{
				    my ($orig, $new, $parent) = check_args_range(2, 3, @_);
				    $parent = $orig->parent unless defined $parent;
				    my $siblings = $parent->content;
				    for my $pos (0 .. scalar(@{$siblings}) - 1) {
				        next unless $siblings->[$pos] eq $orig;
				        $siblings->[$pos] = $new;
				        $new->parent($parent);
				        $orig->parent(undef);
				        return 1;
				    }
				    die "Didn't find $orig in $parent";
				}
				1142
				# Replace $orig, within $parent (default: its own parent), by a fresh
				# "meta" element.  Returns the result of html_replace.
				sub html_replace_by_meta ( $;$ )
				{
				    my ($orig, $parent) = check_args_range(1, 2, @_);
				    my $placeholder = HTML::Element->new("meta");
				    $parent = $orig->parent unless defined $parent;
				    return html_replace($orig, $placeholder, $parent);
				}
				1150
				# Replace $orig, within $parent (default: its own parent), by a fresh
				# "ignore" element.  Returns the result of html_replace.
				sub html_replace_by_ignore ( $;$ )
				{
				    my ($orig, $parent) = check_args_range(1, 2, @_);
				    my $placeholder = HTML::Element->new("ignore");
				    $parent = $orig->parent unless defined $parent;
				    return html_replace($orig, $placeholder, $parent);
				}
				1158
				1159
				1160
				1161	###
				1162	### Collect text elements
				1163	###
				1164
				# State shared between collect_texts and its callback collect_if_text.
				my @collected_texts;            # strings gathered so far
				my $collect_texts_stoppoint;    # element at which collection stops (may be undefined)
				my $done_collecting;            # true once the stop point has been seen

				# Return the list of text strings found by traversing $root, stopping when
				# the optional element $stop is reached.
				sub collect_texts ( $;$ )
				{
				    my ($root, $stop) = check_args_range(1, 2, @_);
				    # print STDERR "collect_texts: $root $stop\n";
				    ($collect_texts_stoppoint, $done_collecting) = ($stop, 0);
				    @collected_texts = ();
				    $root->traverse(\&collect_if_text); # process texts
				    # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
				    return @collected_texts;
				}
				1179
				# Traversal callback for collect_texts: append string nodes to
				# @collected_texts, and set $done_collecting when the stop point is met.
				# Returns 1 only for elements other than the stop point seen before
				# collection has finished; returns 0 in every other case.
				sub collect_if_text ( $$$ )
				{
				    my ($he) = (check_args(3, @_))[0]; # ignore depth and startflag arguments
				    return 0 if $done_collecting;
				    return 0 unless defined $he;
				    unless (ref $he) {
				        push @collected_texts, $he;
				        return 0;
				    }
				    my $at_stoppoint = (defined $collect_texts_stoppoint)
				        && ($he eq $collect_texts_stoppoint);
				    return 1 unless $at_stoppoint;
				    $done_collecting = 1;
				    return 0;
				}
				1194
				1195
				1196	###########################################################################
				1197	### Clean up parse tree
				1198	###
				1199
				# Run the clean-up passes over the tree rooted at $he, in this order:
				# delete_if_navigation, delete_extra_spaces, merge_dl.  Each pass is a
				# separate traversal that ignores text nodes.  Returns $he.
				sub cleanup_parse_tree ( $ )
				{
				    my ($he) = check_args(1, @_);
				    my @passes = (\&delete_if_navigation, \&delete_extra_spaces, \&merge_dl);
				    for my $pass (@passes) {
				        $he->traverse($pass, 'ignore text');
				    }
				    return $he;
				}
				1207
				1208
				1209	## Simpler version that deletes contents but not the element itself.
				1210	# sub delete_if_navigation ( $$$ )
				1211	# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
				1212	# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
				1213	# { $he->delete();
				1214	# return 0; }
				1215	# else
				1216	# { return 1; }
				1217	# }
				1218
				# Traversal callback: when entering a <div class="navigation">, splice it
				# out of its parent's content, delete it, and return 0.  Any other element
				# returns 1.  Calls made when leaving an element return nothing.
				sub delete_if_navigation ( $$$ )
				{
				    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
				    return unless $startflag;

				    my $class = $he->attr('class');
				    unless (($he->tag() eq "div") && (defined $class) && ($class eq 'navigation')) {
				        return 1;
				    }

				    # Work on the parent's own content array (through the reference);
				    # a copy of it would leave the tree unchanged.
				    my $siblings = $he->parent()->content();
				    for (my $pos = 0; $pos < scalar(@{$siblings}); $pos++) {
				        next unless $siblings->[$pos] eq $he;
				        splice(@{$siblings}, $pos, 1);
				        last;
				    }
				    $he->delete();
				    return 0;
				}
				1237
				# Traversal callback: when entering an element, remove space-only children
				# of <head>, <html>, <table>, <tr> and <ul> elements, then apply
				# delete_trailing_spaces to the element.  Returns 1 when entering an
				# element and nothing when leaving one.
				sub delete_extra_spaces ( $$$ )
				{
				    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
				    if (!$startflag) {
				        return;
				    }

				    my $tag = $he->tag;
				    # The alternation must use a bare "|"; an escaped "\|" matches a
				    # literal pipe character, which no tag name contains.
				    if ($tag =~ /^(head|html|table|tr|ul)$/) {
				        delete_child_spaces($he);
				    }
				    delete_trailing_spaces($he);
				    return 1;
				}
				1249
				1250
				# Remove, in place, every child of $he that is a string consisting only of
				# spaces (including the empty string).  Element children are kept.
				# Does nothing when $he has no content.
				sub delete_child_spaces ( $ )
				{
				    my ($he) = check_args(1, @_);
				    my $ref_content = $he->content();
				    # Same guard as delete_trailing_spaces: content may be undefined.
				    if (!defined $ref_content) {
				        return;
				    }
				    # Assign back through the reference so the element's own content
				    # array is modified rather than a copy.
				    @{$ref_content} = grep { ref($_) || ($_ !~ /^ *$/) } @{$ref_content};
				    return;
				}
				1259
				# Remove, in place, space-only string children of $he that
				#  * immediately precede a <br>, <dd>, <dl>, <dt>, <hr>, <p> or <ul>
				#    element, or
				#  * are the last child, when $he itself is a <dd>, <dt>, <h1>-<h6>, <li>
				#    or <p> element.
				# Does nothing when $he has no content.
				sub delete_trailing_spaces ( $ )
				{
				    my ($he) = check_args(1, @_);
				    my $ref_content = $he->content();
				    if (!defined $ref_content) {
				        return;
				    }
				    # Could also check for previous element = /^h[1-6]$/.
				    for (my $i = 0; $i < scalar(@{$ref_content})-1; $i++) {
				        my $elt = $ref_content->[$i];
				        next if ref $elt;
				        if ($elt =~ /^ *$/) {
				            my $next_elt = $ref_content->[$i+1];
				            if ((ref $next_elt) && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
				                splice(@{$ref_content}, $i, 1);
				                # Re-examine this index: it now holds the next element.
				                $i--;
				            }
				        }
				    }
				    if ($he->tag =~ /^(dd|dt|h[1-6]|li|p)$/) {
				        my $last_elt = $ref_content->[-1];
				        if ((defined $last_elt) && !(ref $last_elt) && ($last_elt =~ /^ *$/)) {
				            pop @{$ref_content};
				        }
				    }
				    return;
				}
				1277
				1278
				# If we find a paragraph that looks like
				#   <P>
				#     <HR>
				#     <UL>
				# (exactly those two children), hand it to process_child_links to
				# accumulate its links into a contents_list, delete the paragraph and
				# return 0.  Every other element returns 1.  Calls made when leaving an
				# element return nothing.
				sub process_if_child_links ( $$$ )
				{
				    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
				    return unless $startflag;

				    return 1 unless $he->tag() eq "p";
				    my $children = $he->content();
				    return 1 unless defined $children;

				    my $is_link_paragraph = (scalar(@{$children}) == 2)
				        && (ref $children->[0]) && ($children->[0]->tag() eq "hr")
				        && (ref $children->[1]) && ($children->[1]->tag() eq "ul");
				    return 1 unless $is_link_paragraph;

				    process_child_links($he);
				    $he->delete();
				    return 0;
				}
				1301
				1302
				# If we find
				#   <H4>
				#     "Footnotes"
				#   <DL>
				#     <DT>
				#       <A NAME="foot560">
				#         "...borrow"
				#       <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
				#         "1.2"
				#     <DD>
				#       "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
				#     ...
				# then record the footnote information and delete the section and list.

				# True between seeing the "Footnotes" <H4> and the <DL> that must follow it.
				my $process_if_footnotes_expect_dl_next = 0;

				# Traversal callback implementing the above.  Each <DD> is stored in
				# %footnotes under the NAME shared by the two anchors of its <DT>, and the
				# heading, the <DD>s and the <DL> are replaced by "ignore" elements.
				# Returns 0 for the heading and the list, 1 for other elements, and nothing
				# when leaving an element.  Dies if the structure is not as expected.
				sub process_if_footnotes ( $$$ )
				{
				    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
				    if (!$startflag) {
				        return;
				    }

				    if (($he->tag() eq "h4")
				        && has_single_content_string($he)
				        && ($he->content->[0] eq "Footnotes")) {
				        html_replace_by_ignore($he);
				        $process_if_footnotes_expect_dl_next = 1;
				        return 0;
				    }

				    if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl")) {
				        my $ref_content = $he->content();
				        if (defined $ref_content) {
				            $process_if_footnotes_expect_dl_next = 0;
				            # Iterate over a copy: html_replace_by_ignore below rewrites
				            # the <DL>'s real content array.
				            my @content = @{$ref_content};
				            for (my $i=0; $i<$#content; $i+=2) {
				                my $he_dt = $content[$i];
				                my $he_dd = $content[$i+1];
				                if (!(ref $he_dt) || !(ref $he_dd)
				                    || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd")) {
				                    $he->dump;
				                    die "expected <DT> and <DD> at positions $i and ", $i+1;
				                }
				                my @dt_content = @{$he_dt->content() || []};
				                if ((scalar(@dt_content) != 2)
				                    || !(ref $dt_content[0]) || ($dt_content[0]->tag ne "a")
				                    || !(ref $dt_content[1]) || ($dt_content[1]->tag ne "a")) {
				                    $he_dt->dump;
				                    die "Expected 2 anchors as content of <DT>";
				                }
				                # Only the names are needed; href and content are unused.
				                # The second call must look at the SECOND anchor, otherwise
				                # the comparison below can never fail.
				                my ($dt1_name) = anchor_info($dt_content[0]);
				                my ($dt2_name) = anchor_info($dt_content[1]);
				                if (!(defined $dt1_name) || !(defined $dt2_name)
				                    || ($dt1_name ne $dt2_name)) {
				                    $he_dt->dump;
				                    die "Expected identical names for anchors";
				                }
				                html_replace_by_ignore($he_dd);
				                $he_dd->tag("div"); # has no effect
				                $footnotes{$dt1_name} = $he_dd;
				            }
				            html_replace_by_ignore($he);
				            return 0;
				        }
				    }

				    if ($process_if_footnotes_expect_dl_next) {
				        $he->dump;
				        die "Expected <DL> for footnotes next";
				    }

				    return 1;
				}
				1366
				1367
				1368
				## Merge two adjacent paragraphs containing <DL> items, such as:
				#     <P>
				#       <DL>
				#         <DT>
				#           ...
				#         <DD>
				#           ...
				#     <P>
				#       <DL>
				#         <DT>
				#           ...
				#         <DD>
				#           ...

				# Traversal callback.  Among the children of $he, every run of consecutive
				# <P> elements that each hold a single <DL> is collapsed into the first
				# one: the later paragraphs are removed from $he and the content of their
				# <DL> is appended to the first paragraph's <DL>.
				# Returns 1 when entering an element that has content, nothing otherwise.
				sub merge_dl ( $$$ )
				{
				    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
				    return unless $startflag;

				    my $children = $he->content;
				    return unless defined $children;

				    for (my $pos = 0; $pos < scalar(@{$children}) - 1; $pos++) {
				        my $head_p = $children->[$pos];
				        next unless (ref $head_p) && ($head_p->tag eq "p")
				            && has_single_content_with_tag($head_p, "dl");
				        my $head_dl = $head_p->content->[0];

				        # $pos stays put in this loop; the bound shrinks instead, because
				        # merged paragraphs are removed from the content of $he.
				        while ($pos < scalar(@{$children}) - 1) {
				            my $next_p = $children->[$pos+1];
				            last unless (ref $next_p) && ($next_p->tag eq "p")
				                && has_single_content_with_tag($next_p, "dl");
				            # Merge these two elements.
				            splice(@{$children}, $pos+1, 1);     # remove the second paragraph
				            my $next_dl = $next_p->content->[0];
				            $head_dl->push_content(@{$next_dl->content});  # move its items over
				        }

				        # Extra step: the element after the run is known not to be a
				        # candidate for the head of a new run.
				        $pos++;
				    }
				    return 1;
				}
				1414
				1415
				1416
				1417	###########################################################################
				1418	### Testing
				1419	###
				1420
				# Developer test driver, selected by $action:
				#   "view" or ""  parse $file with file_to_tree (which also cleans the
				#                 tree up) and dump the tree
				#   "raw"         parse $file without the clean-up and dump the tree
				#   "section"     run process_section_file on $file
				#   "index"       run process_index_file on $file and print the index info
				# Any other action dies.  A disabled branch (elsif (0)) processes a fixed
				# list of sample section files.
				sub test ( $$ )
				{
				    my ($action, $file) = check_args(2, @_);

				    # General testing
				    if (($action eq "view") || ($action eq "")) {
				        # # $file = "/homes/gws/mernst/www/links.html";
				        # # $file = "/homes/gws/mernst/www/index.html";
				        # # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
				        # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
				        # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
				        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
				        my $tree = file_to_tree($file);

				        ## Testing
				        # print STDERR $tree->as_HTML;
				        $tree->dump();

				        # print STDERR $tree->tag(), "\n";
				        # print STDERR @{$tree->content()}, "\n";
				        #
				        # for (@{ $tree->extract_links(qw(a img)) }) {
				        #   my ($link, $linkelem) = @$_;
				        #   print STDERR "$link ", $linkelem->as_HTML;
				        #   }
				        #
				        # print STDERR @{$tree->extract_links()}, "\n";

				        # my @top_level_elts = @{$tree->content()};

				        # if scalar(@{$tree->content()})
				        return;
				    }

				    elsif ($action eq "raw") {
				        my $tree = HTML::TreeBuilder->new;
				        $tree->ignore_unknown(1);
				        # $tree->warn(1);
				        $tree->parse_file($file);

				        $tree->dump();

				        # cleanup_parse_tree($tree);
				        # $tree->dump();
				        return;
				    }

				    # Test dealing with a section.
				    elsif ($action eq "section") {
				        # my $file;
				        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
				        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
				        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
				        process_section_file($file, 0, "Title");
				    }

				    # Test dealing with many sections
				    elsif (0) {
				        # All sample files live in one directory; "contents" and "genindex"
				        # are deliberately left out of the list.
				        my $sample_dir = "/homes/fish/mernst/tmp/python-doc/html/api";
				        my @files = map { "$sample_dir/$_.html" }
				            qw(about abstract api cObjects complexObjects concrete
				               countingRefs debugging dictObjects embedding
				               exceptionHandling exceptions fileObjects floatObjects
				               front fundamental importing includes index
				               initialization intObjects intro listObjects longObjects
				               mapObjects mapping newTypes node24 noneObject number
				               numericObjects object objects os otherObjects
				               processControl refcountDetails refcounts sequence
				               sequenceObjects standardExceptions stringObjects threads
				               tupleObjects typeObjects types utilities veryhigh);
				        for my $file (@files) {
				            print STDERR "\n", "=" x 75, "\n", "$file:\n";
				            process_section_file($file, 0, "Title");
				        }
				    }

				    # Test dealing with index.
				    elsif ($action eq "index") {
				        # my $file;
				        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";

				        process_index_file($file, "\@cindex");
				        print_index_info();
				    }

				    else {
				        die "Unrecognized action `$action'";
				    }
				}
				1546
				1547
				1548	###########################################################################
				1549	### Main loop
				1550	###
				1551
				# Convert the HTML tree whose main (or contents) page is $file into a
				# Texinfo file.
				#
				# The output name is derived from $file: a trailing ".html" (and a trailing
				# "index" component) is dropped, the current directory's name is used if
				# nothing is left, and ".texi" is appended to the last path component.
				# Sets the global $html_directory to the directory part of $file, writes
				# the Texinfo header, processes $file as the "Top" section and then every
				# entry of @contents_list, and finally emits one index node per entry of
				# @index_titles followed by @contents and @bye.
				# Dies if the output file cannot be opened or closed.
				sub process_contents_file ( $ )
				{
				    my ($file) = check_args(1, @_);

				    # could also use File::Basename
				    my $info_file = $file;
				    $info_file =~ s/(\/?index)?\.html$//;
				    if ($info_file eq "") {
				        chomp($info_file = `pwd`);
				    }
				    $info_file =~ s/^.*\///;    # not the most efficient way to remove dirs

				    # Keep everything up to and including the last "/" (or nothing at all
				    # when $file has no directory part).
				    $html_directory = $file;
				    $html_directory =~ s/(\/|^)[^\/]+$/$1/;

				    my $texi_file = "$info_file.texi";
				    # TEXI is the global handle every output routine prints to, so it has
				    # to stay a bareword handle; the three-argument form keeps characters
				    # in the name from being read as an open mode.
				    open(TEXI, '>', $texi_file)
				        or die "Can't open `$texi_file' for writing: $!";

				    print TEXI "\\input texinfo   \@c --texinfo--\n";
				    print TEXI "\@c %**start of header\n";
				    print TEXI "\@setfilename $info_file\n";

				    # 2. Summary Description and Copyright
				    #      The "Summary Description and Copyright" segment describes the
				    #      document and contains the copyright notice and copying permissions
				    #      for the Info file.  The segment must be enclosed between `@ifinfo'
				    #      and `@end ifinfo' commands so that the formatters place it only in
				    #      the Info file.
				    #
				    # The summary description and copyright segment does not appear in the
				    # printed document.
				    #
				    #      @ifinfo
				    #      This is a short example of a complete Texinfo file.
				    #
				    #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
				    #      @end ifinfo


				    # 3. Title and Copyright
				    #      The "Title and Copyright" segment contains the title and copyright
				    #      pages and copying permissions for the printed manual.  The segment
				    #      must be enclosed between `@titlepage' and `@end titlepage'
				    #      commands.  The title and copyright page appear only in the printed
				    #      manual.
				    #
				    # The titlepage segment does not appear in the Info file.
				    #
				    #      @titlepage
				    #      @sp 10
				    #      @comment The title is printed in a large font.
				    #      @center @titlefont{Sample Title}
				    #
				    #      @c The following two commands start the copyright page.
				    #      @page
				    #      @vskip 0pt plus 1filll
				    #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
				    #      @end titlepage


				    # 4. `Top' Node and Master Menu
				    #      The "Master Menu" contains a complete menu of all the nodes in the
				    #      whole Info file.  It appears only in the Info file, in the `Top'
				    #      node.
				    #
				    # The `Top' node contains the master menu for the Info file.  Since a
				    # printed manual uses a table of contents rather than a menu, the master
				    # menu appears only in the Info file.
				    #
				    #      @node    Top,       First Chapter, ,         (dir)
				    #      @comment node-name, next,          previous, up
				    #
				    #      @menu
				    #      * First Chapter::    The first chapter is the
				    #                           only chapter in this sample.
				    #      * Concept Index::    This index has two entries.
				    #      @end menu



				    # Use the argument rather than reaching back into @ARGV.
				    $current_ref_tdf = [ "Top", 0, $file ];
				    process_section_file($file, 0, "Top");
				    # Each queued entry is passed to process_section_file as
				    # (element 2, element 1, element 0).
				    while (scalar(@contents_list)) {
				        $current_ref_tdf = shift @contents_list;
				        process_section_file($current_ref_tdf->[2], $current_ref_tdf->[1], $current_ref_tdf->[0]);
				    }

				    print TEXI "\n";
				    for my $indextitle (@index_titles) {
				        # Look the @printindex argument up outside the string so the
				        # dereference does not depend on string interpolation rules.
				        my $printindex_arg = $index_info{$indextitle}[1];
				        print TEXI "\@node $indextitle\n";
				        print TEXI "\@unnumbered $indextitle\n";
				        print TEXI "\@printindex $printindex_arg\n";
				        print TEXI "\n";
				    }

				    print TEXI "\@contents\n";
				    print TEXI "\@bye\n";
				    # Buffered write errors only show up when the handle is closed.
				    close(TEXI)
				        or die "Error closing `$texi_file': $!";
				}
				1648
				# This needs to be last so global variable initializations are reached.
				#
				# Usage:
				#   html2texi.pl CONTENTS.html            convert the tree rooted at CONTENTS.html
				#   html2texi.pl -test [[ACTION] FILE]    run test(ACTION, FILE) and exit;
				#                                         ACTION defaults to "", FILE to "index.html"

				die "No arguments supplied to html2texi.pl"
				    unless scalar(@ARGV);

				if ($ARGV[0] eq "-test") {
				    my @test_args = @ARGV[1..$#ARGV];
				    die "Too many test arguments passed to html2texi: ", join(" ", @ARGV)
				        if scalar(@test_args) > 2;
				    # The file is always the last test argument; the action, if given,
				    # precedes it.
				    my $test_file   = scalar(@test_args) ? pop @test_args : "index.html";
				    my $test_action = scalar(@test_args) ? pop @test_args : "";
				    test($test_action, $test_file);
				    exit();
				}

				die "Pass one argument, the main/contents page"
				    unless scalar(@ARGV) == 1;

				process_contents_file($ARGV[0]);