#! /usr/bin/env perl
# html2texi.pl -- Convert HTML documentation to Texinfo format
# Michael Ernst <mernst@cs.washington.edu>
# Time-stamp: <1999-01-12 21:34:27 mernst>

6# This program converts HTML documentation trees into Texinfo format.
7# Given the name of a main (or contents) HTML file, it processes that file,
8# and other files (transitively) referenced by it, into a Texinfo file
9# (whose name is chosen from the file or directory name of the argument).
10# For instance:
Fred Drake54bad441999-01-14 18:17:07 +000011# html2texi.pl api/index.html
Fred Drake3fe1d321999-01-08 15:25:29 +000012# produces file "api.texi".
13
14# Texinfo format can be easily converted to Info format (for browsing in
15# Emacs or the standalone Info browser), to a printed manual, or to HTML.
16# Thus, html2texi.pl permits conversion of HTML files to Info format, and
17# secondarily enables producing printed versions of Web page hierarchies.
18
19# Unlike HTML, Info format is searchable. Since Info is integrated into
20# Emacs, one can read documentation without starting a separate Web
21# browser. Additionally, Info browsers (including Emacs) contain
22# convenient features missing from Web browsers, such as easy index lookup
23# and mouse-free browsing.
24
25# Limitations:
Fred Drake54bad441999-01-14 18:17:07 +000026# html2texi.pl is currently tuned to latex2html output (and it corrects
27# several latex2html bugs), but should be extensible to arbitrary HTML
28# documents. It will be most useful for HTML with a hierarchical structure
29# and an index, and it recognizes those features as created by latex2html
30# (and possibly by some other tools). The HTML tree to be traversed must
31# be on local disk, rather than being accessed via HTTP.
Fred Drake3fe1d321999-01-08 15:25:29 +000032# This script requires the use of "checkargs.pm". To eliminate that
33# dependence, replace calls to check_args* by @_ (which is always the last
34# argument to those functions).
35# Also see the "to do" section, below.
36# Comments, suggestions, bug fixes, and enhancements are welcome.
37
Fred Drake54bad441999-01-14 18:17:07 +000038# Troubleshooting:
39# Malformed HTML can cause this program to abort, so
40# you should check your HTML files to make sure they are legal.
41
42
Fred Drake3fe1d321999-01-08 15:25:29 +000043###
44### Typical usage for the Python documentation:
45###
46
47# (Actually, most of this is in a Makefile instead.)
48# The resulting Info format Python documentation is currently available at
49# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
50
Fred Drake54bad441999-01-14 18:17:07 +000051# Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DT><DL COMPACT><DD>.
Fred Drake3fe1d321999-01-08 15:25:29 +000052
53# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
54# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
55# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
56# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
57# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
58# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
59
60# Edit the generated .texi files:
61# * change @setfilename to prefix "python-"
62# * fix up any sectioning, such as for Abstract
63# * make Texinfo menus
64# * perhaps remove the @detailmenu ... @end detailmenu
Fred Drake54bad441999-01-14 18:17:07 +000065# In Emacs, to do all this:
Fred Drake3fe1d321999-01-08 15:25:29 +000066# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
67
68# makeinfo api.texi
69# makeinfo ext.texi
70# makeinfo lib.texi
71# makeinfo mac.texi
72# makeinfo ref.texi
73# makeinfo tut.texi
74
75
76###
77### Structure of the code
78###
79
80# To be written...
81
82
83###
84### Design decisions
85###
86
87# Source and destination languages
88# --------------------------------
89#
90# The goal is Info files; I create Texinfo, so I don't have to worry about
91# the finer details of Info file creation. (I'm not even sure of its exact
92# format.)
93#
94# Why not start from LaTeX rather than HTML?
95# I could hack latex2html itself to produce Texinfo instead, or fix up
# partparse.py (which already translates LaTeX to Texinfo).
97# Pros:
98# * has high-level information such as index entries, original formatting
99# Cons:
100# * those programs are complicated to read and understand
101# * those programs try to handle arbitrary LaTeX input, track catcodes,
102# and more: I don't want to go to that effort. HTML isn't as powerful
103# as LaTeX, so there are fewer subtleties.
104# * the result wouldn't work for arbitrary HTML documents; it would be
105# nice to eventually extend this program to HTML produced from Docbook,
106# Frame, and more.
107
108# Parsing
109# -------
110#
111# I don't want to view the text as a linear stream; I'd rather parse the
112# whole thing and then do pattern matching over the parsed representation (to
113# find idioms such as indices, lists of child nodes, etc.).
114# * Perl provides HTML::TreeBuilder, which does just what I want.
115# * libwww-perl: http://www.linpro.no/lwp/
116# * TreeBuilder: HTML-Tree-0.51.tar.gz
117# * Python Parsers, Formatters, and Writers don't really provide the right
118# interface (and the version in Grail doesn't correspond to another
119# distributed version, so I'm confused about which to be using). I could
120# write something in Python that creates a parse tree, but why bother?
121
122# Other implementation language issues:
123# * Python lacks variable declarations, reasonable scoping, and static
124# checking tools. I've written some of the latter for myself that make
125# my Perl programming a lot safer than my Python programming will be until
126# I have a similar suite for that language.
127
128
129###########################################################################
130### To do
131###
132
133# Section names:
134# Fix the problem with multiple sections in a single file (eg, Abstract in
135# Front Matter section).
136# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
137# Index:
138# Perhaps double-check that every tag mentioned in the index is found
139# in the text.
140# Python: email to python-docs@python.org, to get their feedback.
141# Compare to existing lib/ Info manual
142# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
143# Postpass to remove extra quotation marks around typography already in
144# a different font (to avoid double delimiters as in "`code'"); or
145# perhaps consider using only font-based markup so that we don't get
146# the extra *bold* and `code' markup in Info.
147
148## Perhaps don't rely on automatic means for adding up, next, prev; I have
149## all that info available to me already, so it's not so much trouble to
150## add it. (Right?) But it is *so* easy to use Emacs instead...
151
152
153###########################################################################
154### Strictures
155###
156
157# man HTML::TreeBuilder
158# man HTML::Parser
159# man HTML::Element
160
161# require HTML::ParserWComment;
162require HTML::Parser;
163require HTML::TreeBuilder;
164require HTML::Element;
165
166use File::Basename;
Fred Drake3fe1d321999-01-08 15:25:29 +0000167
168use strict;
169# use Carp;
170
Fred Drake3fe1d321999-01-08 15:25:29 +0000171use checkargs;
172
173
174###########################################################################
175### Variables
176###
177
# Node titles of the sections enclosing the page being processed, outermost
# first.  process_section_file truncates this to the current depth and then
# pushes the new node title.
my @section_stack;

# Reference to the (title, depth, file) entry for the file currently being
# processed; used in error messages and in depth sanity checks.
my $current_ref_tdf;

# Prefix prepended to relative file names before they are read.
my $html_directory;

# Footnote elements of the current file, keyed by names of the form "foot<N>";
# emptied at the start of each section file.
my %footnotes;

# Texinfo sectioning command for each nesting level.
# First element should not be used.
my @sectionmarker = qw(manual chapter section subsection subsubsection);

# HTML inline tag => name of the Texinfo command that replaces it.
my %inline_markup = (
    b      => "strong",
    code   => "code",
    i      => "emph",
    kbd    => "kbd",
    samp   => "samp",
    strong => "strong",
    tt     => "code",
    var    => "var",
);

# Index lines collected but not yet written; flushed by
# do_deferred_index_entries.
my @deferred_index_entries;

# Titles of the index sections encountered (process_child_links pushes the
# title of every section it recognizes through %index_info).
my @index_titles;

# Index section title => [ Texinfo indexing command, @defindex suffix ].
my %index_info = (
    "Index"         => [ "\@blindex", "bl" ],
    "Concept Index" => [ "\@cindex",  "cp" ],
    "Module Index"  => [ "\@mdindex", "md" ],
);
202
203
204###########################################################################
205### Main/contents page
206###
207
208# Process first-level page on its own, or just a contents page? Well, I do
209# want the title, author, etc., and the front matter... For now, just add
210# that by hand at the end.
211
212
213# data structure possibilities:
214# * tree-like (need some kind of stack when processing (or parent pointers))
215# * list of name and depth; remember old and new depths.
216
# Master table of contents.
# Each element is a reference to a list of (nodetitle, depth, filename).
my @contents_list;

# Doing node-name fixups on the fly has a drawback: some sections may already
# have been processed (and be unavailable) by the time another section with
# the same name turns up.  It would probably be better to build the complete
# contents list up front, reading in every file of interest; that would also
# help with cross-references, since all files would already be loaded.

# Titles that have been added to @contents_list (title => 1).
my %contents_hash;
# Titles that have been added more than once and so need disambiguating
# (title => 1).
my %contents_fixups;

# Child entries collected for the page currently being examined; same element
# layout as @contents_list.  Emptied again by merge_contents_lists.
my @current_contents_list;
230
# Merge @current_contents_list into @contents_list, then empty
# @current_contents_list.
#
# The two lists are walked in parallel, position by position:
#   * past the end of @contents_list, the current entry is appended;
#   * if title and file both differ and the master entry is shallower than
#     the current one, the current entry is inserted in front of it;
#   * otherwise both entries must agree on title, depth and file.
# A title added for the first time is recorded in %contents_hash; a title
# added again is flagged in %contents_fixups.
#
# Takes no arguments.  Dies if @current_contents_list is empty or if the
# lists disagree at some position.
sub merge_contents_lists ( )
{
    check_args(0, @_);

    if (!@current_contents_list) {
        die "empty current_contents_list";
    }

    # Bookkeeping shared by the append and insert cases.
    my $note_title = sub {
        my ($added_title) = @_;
        if (defined $contents_hash{$added_title}) {
            $contents_fixups{$added_title} = 1;
        }
        else {
            $contents_hash{$added_title} = 1;
        }
    };

    for my $i (0 .. $#current_contents_list) {
        my $ref_c_tdf = $current_contents_list[$i];

        # Nothing left in the master list to compare against: append.
        if ($i > $#contents_list) {
            push @contents_list, $ref_c_tdf;
            $note_title->($ref_c_tdf->[0]);
            next;
        }

        my ($title, $depth, $file) = @{ $contents_list[$i] };
        my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};
        my $same_title = ($title eq $c_title);
        my $same_file  = ($file eq $c_file);

        # A different, deeper entry from a different file: insert it here.
        if (!$same_title && !$same_file && ($depth < $c_depth)) {
            splice @contents_list, $i, 0, $ref_c_tdf;
            $note_title->($c_title);
            next;
        }

        # Anything else has to be an exact match.
        unless ($same_title && $same_file && ($depth == $c_depth)) {
            die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
                 "\n main: <<<$title>>> $depth $file",
                 "\n curr: <<<$c_title>>> $c_depth $c_file");
        }
    }

    @current_contents_list = ();
}
288
289
290
# Collect the child links found under element $he into
# @current_contents_list as (title, depth, href) entries, then merge that
# list into @contents_list.
#
# Steps:
#   1. Traverse $he; increment_current_contents_list adds one entry per <LI>.
#   2. Renumber the raw tree depths so that they continue directly below the
#      depth of the current section (for instance 1,3,5 becomes d+1,d+2,d+3).
#   3. Drop the "About this document ..." and "Contents" entries.
#   4. For every entry whose title is a key of %index_info, read the index
#      file, emit an @defindex line to TEXI, and remove the entry.
#   5. Call merge_contents_lists.
#
# Dies if @current_contents_list is not empty on entry, or if the depth
# recorded in $current_ref_tdf disagrees with the size of @section_stack.
#
# Maybe this function should also produce a map from title (or href) to
# section level (eg "chapter"?).
sub process_child_links ( $ )
{
    my ($he) = check_args(1, @_);

    # $he->dump();
    if (scalar(@current_contents_list) != 0) {
        die "current_contents_list nonempty: @current_contents_list";
    }
    $he->traverse(\&increment_current_contents_list, 'ignore text');

    # Normalize the depths; for instance, convert 1,3,5 into 0,1,2.
    my %depths = ();
    for my $ref_tdf (@current_contents_list) {
        $depths{ $ref_tdf->[1] } = 1;
    }
    # The depths are integers, so they must be sorted numerically; the
    # default string sort would place 11 before 3.
    my @sorted_depths = sort { $a <=> $b } keys %depths;
    my $current_depth = scalar(@section_stack)-1;
    my $current_depth_2 = $current_ref_tdf->[1];
    if ($current_depth != $current_depth_2) {
        die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack);
    }
    for (my $i=0; $i<scalar(@sorted_depths); $i++) {
        $depths{$sorted_depths[$i]} = $i + $current_depth+1;
    }
    for my $ref_tdf (@current_contents_list) {
        $ref_tdf->[1] = $depths{ $ref_tdf->[1] };
    }

    # Eliminate uninteresting sections.  Hard-coded hack for now.
    # The emptiness guard lets an empty list reach merge_contents_lists,
    # which reports it with a clear message.
    if ((scalar(@current_contents_list) > 0)
        && ($current_contents_list[-1][0] eq "About this document ..."))
    {
        pop @current_contents_list;
    }
    if ((scalar(@current_contents_list) > 1)
        && ($current_contents_list[1][0] eq "Contents"))
    {
        # Remove the second entry, keeping the first.
        splice @current_contents_list, 1, 1;
    }

    # Index sections are processed here and taken out of the contents list.
    # The list shrinks while it is walked, hence the index loop and $i--.
    for (my $i=0; $i<scalar(@current_contents_list); $i++) {
        my $ref_tdf = $current_contents_list[$i];
        my $title = $ref_tdf->[0];
        if (exists $index_info{$title}) {
            my $index_file = $ref_tdf->[2];
            my ($indexing_command, $suffix) = @{$index_info{$title}};
            process_index_file($index_file, $indexing_command);
            print TEXI "\n\@defindex $suffix\n";
            push @index_titles, $title;
            splice @current_contents_list, $i, 1;
            $i--;
        }
        elsif ($title =~ /\bIndex$/) {
            print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n";
        }
    }

    merge_contents_lists();

    # print_contents_list();
    # print_index_info();
}
344
345
# Traversal callback used by process_child_links.
#
# Arguments: the element being visited, the start flag (true when the
# element is entered, false when it is left) and the element's depth.
# For every <LI> that is entered, an entry [ title, depth, href ] is pushed
# onto @current_contents_list; the title is the text of the <A> that must
# be the first child of the <LI>.
#
# Returns 1 when called with a true start flag and nothing otherwise.
# Dies if the first child of an <LI> is not an <A>.
sub increment_current_contents_list ( $$$ )
{
    my ($he, $startflag, $depth) = check_args(3, @_);

    return unless $startflag;
    return 1 unless $he->tag eq "li";

    my @li_content = @{$he->content};
    my $anchor = $li_content[0];
    if ($anchor->tag ne "a") {
        die "first element of <LI> should be <A>";
    }

    # Only the HREF is needed; the anchor's NAME and content are ignored.
    my (undef, $href) = anchor_info($anchor);

    my $title = texi_remove_punctuation(join("", collect_texts($anchor)));
    # The problem with these is that they are formatted differently in
    # @menu and @node!
    for my $fixup ([ qr/``/, "\"" ], [ qr/''/, "\"" ], [ qr/ -- /, " " ]) {
        my ($pattern, $replacement) = @{$fixup};
        $title =~ s/$pattern/$replacement/g;
    }

    push @current_contents_list, [ $title, $depth, $href ];
    return 1;
}
367
# Convert a piece of HTML to Texinfo text.  Simple version, meant for
# section titles.
#
# $he is either a plain string, which is returned unchanged, or an element
# whose tag is a key of %inline_markup, which becomes "@command{...}" with
# its children converted recursively.
# Any other element is dumped, and the function dies.
sub html_to_texi ( $ )
{
    my ($he) = check_args(1, @_);

    return $he unless ref $he;

    my $tag = $he->tag;
    unless (exists $inline_markup{$tag}) {
        $he->dump();
        die "html_to_texi confused by <$tag>";
    }

    my $texi = "\@$inline_markup{$tag}\{";
    for my $child (@{$he->content}) {
        $texi .= html_to_texi($child);
    }
    return $texi . "\}";
}
385
386
387
# Debugging aid: write every (title, depth, file) entry of @contents_list
# to STDERR, one entry per line, after a "Contents list:" heading.
sub print_contents_list ()
{
    check_args(0, @_);

    print STDERR "Contents list:\n";
    for my $entry (@contents_list) {
        my ($title, $depth, $file) = @{$entry};
        print STDERR join(" ", $title, $depth, $file) . "\n";
    }
}
395
396
397
398###########################################################################
399### Index
400###
401
# Anchor name that marks a broken LaTeX2HTML index link: an "l2h-" prefix
# with no number after it.
my $l2h_broken_link_name = "l2h-";


# Map from file to (map from anchor name to (list of index texts)).
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
my %file_index_entries;
# The inner map for the file being processed:
# anchor name => list of index texts.
my %this_index_entries;

# Map from file to (list of index texts) for entries whose link is broken.
my %file_index_entries_broken;
# The list for the file being processed.
my @this_index_entries_broken;

# Prefix put in front of the text of nested index entries, and the stack of
# outer prefixes saved while a nested <DL> is processed.
my $index_prefix = "";
my @index_prefixes;

# Texinfo indexing command for the index file being read; set only for the
# duration of process_index_file.
my $this_indexing_command;
418
# Debugging aid: write the collected index information to STDERR.
# First, for each file in %file_index_entries, every anchor name with its
# index texts (extra texts go on continuation lines); then, for each file in
# %file_index_entries_broken, the texts whose link was broken.
sub print_index_info ()
{
    check_args(0, @_);

    for my $file (sort keys %file_index_entries) {
        my $entries_by_anchor = $file_index_entries{$file};
        print STDERR "file: $file\n";
        for my $aname (sort keys %{$entries_by_anchor}) {
            # Joining a one-element list yields that element, so a single
            # print covers both the one-entry and the many-entry case.
            my $continuation = "\n " . (" " x length($aname));
            print STDERR " $aname : " . join($continuation, @{ $entries_by_anchor->{$aname} }) . "\n";
        }
    }

    for my $file (sort keys %file_index_entries_broken) {
        print STDERR "file: $file\n";
        print STDERR " $_\n" for @{ $file_index_entries_broken{$file} };
    }
}
438
439
# Read the index page $file (relative to $html_directory) and record its
# entries by handing every <DL COMPACT> in it to process_index_dl_compact.
# $indexing_command is the Texinfo command stored with each entry; it is
# published through $this_indexing_command while the page is traversed.
sub process_index_file ( $$ )
{
    my ($file, $indexing_command) = check_args(2, @_);
    # print "process_index_file $file $indexing_command\n";

    my $index_tree = file_to_tree($html_directory . $file);
    # $index_tree->dump();

    $this_indexing_command = $indexing_command;
    $index_tree->traverse(\&process_if_index_dl_compact, 'ignore text');
    # print "process_index_file done\n";
    undef $this_indexing_command;
}
452
453
# Traversal callback used by process_index_file.
# On entering a <DL> that carries a COMPACT attribute, hands it to
# process_index_dl_compact and returns 0; on entering any other element,
# returns 1.  Returns nothing when the start flag is false.
sub process_if_index_dl_compact ( $$$ )
{
    # The third (depth) argument is not needed.
    my ($he, $startflag, undef) = check_args(3, @_);

    return unless $startflag;

    my $is_compact_dl = ($he->tag() eq "dl") && (defined $he->attr('compact'));
    return 1 unless $is_compact_dl;

    process_index_dl_compact($he);
    return 0;
}
465
466
# Process one <DL COMPACT> list from a LaTeX2HTML index.
#
# The elements of such a list:
#  * whitespace-only text: ignored
#  * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
#  * Followed by a <DD> element:  the <DT> contains a single
#    string, and the <DD> contains a whitespace string to be ignored, a
#    <DL COMPACT> to be recursively processed (with the <DT> string as a
#    prefix), and a whitespace string to be ignored.
#  * Not followed by a <DD> element:  contains a list of anchors
#    and texts (ignore the texts, which are only whitespace and commas).
#    Optionally contains a <DL COMPACT> to be recursively processed (with
#    the <DT> string as a prefix)
#
# $h is the <DL> element.  Each <DT>/<DD> pair goes to
# process_index_dt_and_dd and each unpaired <DT> to process_index_lone_dt.
# Dies on any child that is neither a <DT> nor ignorable whitespace.
sub process_index_dl_compact ( $ )
{
    my ($h) = check_args(1, @_);

    # Drop whitespace-only text up front, so that it cannot hide a <DD>
    # from the lookahead below.  An empty <DL> is treated as having no
    # children.
    my @content = grep { ref($_) || ($_ !~ /^\s*$/) } @{ $h->content() || [] };

    for (my $i = 0; $i < scalar(@content); $i++) {
        my $this_he = $content[$i];
        if (!ref $this_he) {
            # Plain text has no ->tag method; report it explicitly.
            die "Expected <DT> tag but saw text: $this_he";
        }
        if ($this_he->tag ne "dt") {
            $this_he->dump();
            die "Expected <DT> tag: " . $this_he->tag;
        }

        my $next_he = $content[$i+1];   # undef after the last element
        if (ref($next_he) && ($next_he->tag eq "dd")) {
            process_index_dt_and_dd($this_he, $next_he);
            $i++;                       # the <DD> has been consumed as well
        }
        else {
            process_index_lone_dt($this_he);
        }
    }
}
492
493
494
495# Argument is a <DT> element. If it contains more than one anchor, then
496# the texts of all subsequent ones are "[Link]". Example:
497# <DT>
498# <A HREF="embedding.html#l2h-201">
499# "$PATH"
500# ", "
501# <A HREF="embedding.html#l2h-205">
502# "[Link]"
503# Optionally contains a <DL COMPACT> as well. Example:
504# <DT>
505# <A HREF="types.html#l2h-616">
506# "attribute"
507# <DL COMPACT>
508# <DT>
509# <A HREF="assignment.html#l2h-3074">
510# "assignment"
511# ", "
512# <A HREF="assignment.html#l2h-3099">
513# "[Link]"
514# <DT>
515# <A HREF="types.html#l2h-">
516# "assignment, class"
517
# Record the index entries of a <DT> that has no <DD> following it.
#
# The children of $dt are visited in order:
#   * the string ", " is skipped; any other string is an error;
#   * an <A> must contain exactly one string.  The first anchor's string,
#     with $index_prefix in front, is the entry text; later anchors must
#     read "[Link]" or repeat that text.  The entry is stored under the file
#     and anchor name of the HREF: in %file_index_entries_broken when the
#     anchor name equals $l2h_broken_link_name, otherwise in
#     %file_index_entries;
#   * a <DL> is processed recursively, with the first anchor's string added
#     to $index_prefix, and ends the processing of this <DT>.
# Each stored entry is "$this_indexing_command <text>".
# Dies on any content that does not fit this shape.
sub process_index_lone_dt ( $ )
{
    my ($dt) = check_args(1, @_);

    my $entry_text;      # prefixed text of the first anchor
    my $entry_suffix;    # the same text without the prefix

    for my $child (@{$dt->content()}) {
        next if $child eq ", ";
        if (!ref $child) {
            $dt->dump;
            die "Unexpected <DT> string element: $child";
        }

        my $child_tag = $child->tag;

        if ($child_tag eq "dl") {
            # Nested list: its entries are filed under this entry's text.
            push @index_prefixes, $index_prefix;
            die "acontent_suffix not yet defined" unless defined $entry_suffix;
            $index_prefix .= $entry_suffix . ", ";
            process_index_dl_compact($child);
            $index_prefix = pop(@index_prefixes);
            return;
        }

        if ($child_tag ne "a") {
            $dt->dump;
            $child->dump;
            die "Expected anchor in lone <DT>";
        }

        # The anchor's NAME is not used.
        my (undef, $ahref, @link_texts) = anchor_info($child);
        if (scalar(@link_texts) != 1) {
            die "Expected just one content of <A> in <DT>: @link_texts";
        }
        my $link_text = $link_texts[0];
        if (ref $link_text) {
            $link_text->dump;
            die "Expected string content of <A> in <DT>: $link_text";
        }

        if (!defined($entry_text)) {
            $entry_suffix = $link_text;
            $entry_text = $index_prefix . $link_text;
        }
        elsif (($link_text ne "[Link]") && ($entry_text ne ($index_prefix . $link_text))) {
            die "Differing content: <<<$entry_text>>>, <<<$link_text>>>";
        }

        if (!defined $ahref) {
            $dt->dump;
            die "no HREF in nachor in <DT>";
        }
        my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
        # No "#name" part: the link refers to the entire file.
        $ahref_name = "" unless defined $ahref_name;

        my $index_line = "$this_indexing_command $entry_text";

        if ($ahref_name eq $l2h_broken_link_name) {
            # push creates the per-file list on first use.
            push @{ $file_index_entries_broken{$ahref_file} }, $index_line;
            next;
        }

        # Push straight into the nested structure (created on first use);
        # copying the inner hash into a local one would leave the stored
        # data unchanged.
        push @{ $file_index_entries{$ahref_file}{$ahref_name} }, $index_line;
        # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
    }
}
586
# Process a <DT> that is followed by a <DD>.
#
# $dt must contain exactly one string and $dd exactly one <DL>.  The string,
# with trailing spaces removed and ", " appended, is added to $index_prefix
# while the <DL> is processed recursively; afterwards the previous prefix is
# restored.
# Dies if either element does not have that shape.
sub process_index_dt_and_dd ( $$ )
{
    my ($dt, $dd) = check_args(2, @_);

    my @dt_children = @{$dt->content()};
    if ((scalar(@dt_children) != 1) || (ref $dt_children[0])) {
        $dd->dump;
        $dt->dump;
        die "Expected single string (actual size = " . scalar(@dt_children) . ") in content of <DT>: @dt_children";
    }
    (my $heading = $dt_children[0]) =~ s/ +$//;

    my @dd_children = @{$dd->content()};
    if (scalar(@dd_children) != 1) {
        die "Expected single <DD> content, got ", scalar(@dd_children), " elements:\n", join("\n", @dd_children), "\n ";
    }
    my $sublist = $dd_children[0];
    if ($sublist->tag ne "dl") {
        die "Expected <DL> as content of <DD>, but saw: $sublist";
    }

    # Entries of the nested list are filed under "<outer prefix><heading>, ".
    push @index_prefixes, $index_prefix;
    $index_prefix .= $heading . ", ";
    process_index_dl_compact($sublist);
    $index_prefix = pop(@index_prefixes);
}
610
611
612###########################################################################
613### Ordinary sections
614###
615
# Convert one HTML section file to Texinfo.
#
# $file      - file to read; used as given when it starts with "/",
#              otherwise taken relative to $html_directory
# $depth     - nesting depth of the section; @section_stack is cut back to
#              this many entries before the new title is pushed
# $nodetitle - node title of the section
#
# After parsing the file, this: updates @section_stack; runs the
# child-link and footnote traversals over the tree; loads the index entries
# recorded for $file into %this_index_entries and
# @this_index_entries_broken; and finally traverses the <BODY> with
# output_body.
# Dies unless the tree is an <HTML> element whose first two children are
# <HEAD> and <BODY>.
sub process_section_file ( $$$ )
{
    my ($file, $depth, $nodetitle) = check_args(3, @_);

    my $path = ($file =~ /^\//) ? $file : $html_directory . $file;
    my $he = file_to_tree($path);

    # print STDERR "process_section_file: $file $depth $nodetitle\n";

    # Keep only the $depth outermost enclosing sections.
    @section_stack = @section_stack[0..$depth-1];

    # Not a great nodename fixup scheme; need a more global view.
    # A title flagged in %contents_fixups is disambiguated by prepending
    # the first word of the enclosing section's title.
    if ((scalar(@section_stack) > 0) && (defined $contents_fixups{$nodetitle})) {
        my $up_title = $section_stack[-1];
        # hack for Python Standard Library
        $up_title =~ s/^(Built-in|Standard) Module //g;
        my ($up_first_word) = split(/ /, $up_title);
        $nodetitle = "$up_first_word $nodetitle";
    }

    push @section_stack, $nodetitle;
    # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

    $he->traverse(\&process_if_child_links, 'ignore text');
    %footnotes = ();
    # $he->dump;
    $he->traverse(\&process_if_footnotes, 'ignore text');

    # $he->dump;

    # Index entries recorded for this file, if any (empty otherwise).
    %this_index_entries = (exists $file_index_entries{$file})
        ? %{$file_index_entries{$file}}
        : ();
    # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
    @this_index_entries_broken = (exists $file_index_entries_broken{$file})
        ? @{$file_index_entries_broken{$file}}
        : ();

    die "Expected <HTML> at top level" if $he->tag() ne "html";

    my ($head, $body) = @{$he->content()};
    if ((!ref $head) or ($head->tag ne "head")) {
        $he->dump;
        die "<HEAD> not first element of <HTML>";
    }
    if ((!ref $body) or ($body->tag ne "body")) {
        $he->dump;
        die "<BODY> not second element of <HTML>";
    }

    $body->traverse(\&output_body);
}
673
# Stack of the tags we are currently inside that prevent index entries from
# being written right now.  Candidates are "h1" .. "h6" and "dt" (and
# possibly others?); in output_body only <dt> is pushed, the call for
# headings being commented out.
my @index_deferrers;
677
# Track entry into and exit from an element that defers index output.
# With a true $startflag, $tag is pushed onto @index_deferrers.  With a
# false one, the top of the stack is popped and must equal $tag (else die),
# and do_deferred_index_entries is then called.
sub push_or_pop_index_deferrers ( $$ )
{
    my ($tag, $startflag) = check_args(2, @_);

    if ($startflag) {
        push @index_deferrers, $tag;
    }
    else {
        my $innermost = pop @index_deferrers;
        die "Expected $tag at top of index_deferrers but saw $innermost; remainder = ", join(" ", @index_deferrers)
            if $tag ne $innermost;
        do_deferred_index_entries();
    }
}
688
689
# Queue the index entries that belong at anchor $label by appending them to
# @deferred_index_entries.
#
# $label - anchor name
# $he    - optional: the anchor element; only used for guessing (below)
#
# If %this_index_entries has entries for $label, they are queued.
# Otherwise, if $label is the broken LaTeX2HTML link name, the function
# guesses: every entry of @this_index_entries_broken that contains one of
# the candidate texts near the anchor is queued.  For any other label
# nothing happens.
sub label_add_index_entries ( $;$ )
{
    my ($label, $he) = check_args_range(1, 2, @_);
    # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";

    if (exists $this_index_entries{$label}) {
        push @deferred_index_entries, @{$this_index_entries{$label}};
        return;
    }

    return if $label ne $l2h_broken_link_name;

    # Guessing needs the anchor element.  Callers that pass only a label
    # (the heading code does) have nothing to guess from, and calling
    # ->parent on an undefined value would abort the program.
    return if !defined $he;

    # Try to find some text to use in guessing which links should point here.
    # I should probably only look at the previous element, or if that is
    # all punctuation, the one before it; collecting all the previous texts
    # is a bit of overkill.
    my @anchor_texts = collect_texts($he);
    # presumably the texts of the parent that precede $he -- TODO confirm
    # against collect_texts
    my @previous_texts = collect_texts($he->parent, $he);
    # 4 elements is arbitrary; ought to filter out punctuation and small words
    # first, then perhaps keep fewer.  Perhaps also filter out formatting so
    # that we can see a larger chunk of text?  (Probably not.)
    # Also perhaps should do further chunking into words, in case the
    # index term isn't a chunk of its own (eg, was in <tt>...</tt>).
    my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);

    for my $text (@candidate_texts) {
        # Skip texts that are only punctuation, or too short to be telling.
        if ($text =~ /^[\"\`\'().?! ]*$/) {
            next;
        }
        if (length($text) <= 2) {
            next;
        }
        # hack for Python manual; maybe defer until failure first time around?
        $text =~ s/^sys\.//g;
        # A text of exactly "sys." is now empty, and index() finds the empty
        # string in every term; skip it rather than queue every entry.
        if ($text eq "") {
            next;
        }
        for my $iterm (@this_index_entries_broken) {
            # I could test for zero:  LaTeX2HTML's failures in the Python
            # documentation are only for items of the form "... (built-in...)"
            if (index($iterm, $text) != -1) {
                push @deferred_index_entries, $iterm;
                # print STDERR "Guessing index term `$iterm' for text `$text'\n";
            }
        }
    }
}
734
735
# Write the queued index entries to TEXI, one per line with a newline
# before and after the group, and empty the queue.  Nothing is written
# while the queue is empty or while @index_deferrers is non-empty.
#
# Need to add calls to this at various places.
# Perhaps add HTML::Element argument and do the check for appropriateness
# here (ie, no action if inside <H1>, etc.).
sub do_deferred_index_entries ()
{
    check_args(0, @_);

    my $have_entries = (scalar(@deferred_index_entries) > 0);
    my $may_print    = (scalar(@index_deferrers) == 0);

    if ($have_entries && $may_print) {
        print TEXI "\n", join("\n", @deferred_index_entries), "\n";
        @deferred_index_entries = ();
    }
}
746
# Table state used while writing the body.
#   $table_columns      - undefined if not in a table
#   $table_first_column - boolean
my ($table_columns, $table_first_column);
749
750sub output_body ( $$$ )
751{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
752
753 if (!ref $he)
754 { my $space_index = index($he, " ");
755 if ($space_index != -1)
756 { # Why does
757 # print TEXI texi_quote(substr($he, 0, $space_index+1));
758 # give: Can't locate object method "TEXI" via package "texi_quote"
759 # (Because the definition texi_quote hasn't been seen yet.)
760 print TEXI &texi_quote(substr($he, 0, $space_index+1));
761 do_deferred_index_entries();
762 print TEXI &texi_quote(substr($he, $space_index+1)); }
763 else
764 { print TEXI &texi_quote($he); }
765 return; }
766
767 my $tag = $he->tag();
768
769 # Ordinary text markup first
770 if (exists $inline_markup{$tag})
771 { if ($startflag)
772 { print TEXI "\@$inline_markup{$tag}\{"; }
773 else
774 { print TEXI "\}"; } }
775 elsif ($tag eq "a")
776 { my ($name, $href, @content) = anchor_info($he);
777 if (!$href)
778 { # This anchor is only here for indexing/cross referencing purposes.
779 if ($startflag)
780 { label_add_index_entries($name, $he); }
781 }
782 elsif ($href =~ "^(ftp|http|news):")
783 { if ($startflag)
784 { # Should avoid second argument if it's identical to the URL.
785 print TEXI "\@uref\{$href, "; }
786 else
787 { print TEXI "\}"; }
788 }
789 elsif ($href =~ /^\#(foot[0-9]+)$/)
790 { # Footnote
791 if ($startflag)
792 { # Could double-check name and content, but I'm not
793 # currently storing that information.
794 print TEXI "\@footnote\{";
795 $footnotes{$1}->traverse(\&output_body);
796 print TEXI "\}";
797 return 0; } }
798 else
799 { if ($startflag)
Fred Drake54bad441999-01-14 18:17:07 +0000800 { # cross-references are not active Info links, but no text is lost
801 print STDERR "Can't deal with internal HREF anchors yet:\n";
802 $he->dump; }
Fred Drake3fe1d321999-01-08 15:25:29 +0000803 }
804 }
Fred Drake3fe1d321999-01-08 15:25:29 +0000805 elsif ($tag eq "br")
806 { print TEXI "\@\n"; }
807 elsif ($tag eq "body")
808 { }
809 elsif ($tag eq "center")
810 { if (has_single_content_string($he)
811 && ($ {$he->content}[0] =~ /^ *$/))
812 { return 0; }
813 if ($startflag)
814 { print TEXI "\n\@center\n"; }
815 else
816 { print TEXI "\n\@end center\n"; }
817 }
818 elsif ($tag eq "div")
819 { my $align = $he->attr('align');
820 if (defined($align) && ($align eq "center"))
821 { if (has_single_content_string($he)
822 && ($ {$he->content}[0] =~ /^ *$/))
823 { return 0; }
824 if ($startflag)
825 { print TEXI "\n\@center\n"; }
826 else
827 { print TEXI "\n\@end center\n"; } }
828 }
829 elsif ($tag eq "dl")
830 { # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
831 if (has_single_content_with_tag($he, "dd"))
832 { my $he_dd = $ {$he->content}[0];
833 if (has_single_content_with_tag($he_dd, "pre"))
834 { my $he_pre = $ {$he_dd->content}[0];
835 print_pre($he_pre);
836 return 0; } }
837 if ($startflag)
838 { # Could examine the elements, to be cleverer about formatting.
839 # (Also to use ftable, vtable...)
840 print TEXI "\n\@table \@asis\n"; }
841 else
842 { print TEXI "\n\@end table\n"; }
843 }
844 elsif ($tag eq "dt")
845 { push_or_pop_index_deferrers($tag, $startflag);
846 if ($startflag)
847 { print TEXI "\n\@item "; }
848 else
849 { } }
850 elsif ($tag eq "dd")
851 { if ($startflag)
852 { print TEXI "\n"; }
853 else
854 { }
855 if (scalar(@index_deferrers) != 0)
856 { $he->dump;
Fred Drake54bad441999-01-14 18:17:07 +0000857 die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
Fred Drake3fe1d321999-01-08 15:25:29 +0000858 do_deferred_index_entries();
859 }
860 elsif ($tag =~ /^(font|big|small)$/)
861 { # Do nothing for now.
862 }
863 elsif ($tag =~ /^h[1-6]$/)
864 { # We don't need this because we never recursively enter the heading content.
865 # push_or_pop_index_deferrers($tag, $startflag);
866 my $secname = "";
867 my @seclabels = ();
868 for my $elt (@{$he->content})
869 { if (!ref $elt)
870 { $secname .= $elt; }
871 elsif ($elt->tag eq "br")
872 { }
873 elsif ($elt->tag eq "a")
874 { my ($name, $href, @acontent) = anchor_info($elt);
875 if ($href)
876 { $he->dump;
877 $elt->dump;
878 die "Nonsimple anchor in <$tag>"; }
879 if (!defined $name)
880 { die "No NAME for anchor in $tag"; }
881 push @seclabels, $name;
882 for my $subelt (@acontent)
883 { $secname .= html_to_texi($subelt); } }
884 else
885 { $secname .= html_to_texi($elt); } }
886 if ($secname eq "")
887 { die "No section name in <$tag>"; }
888 if (scalar(@section_stack) == 1)
889 { if ($section_stack[-1] ne "Top")
890 { die "Not top? $section_stack[-1]"; }
891 print TEXI "\@settitle $secname\n";
892 print TEXI "\@c %**end of header\n";
893 print TEXI "\n";
894 print TEXI "\@node Top\n";
895 print TEXI "\n"; }
896 else
897 { print TEXI "\n\@node $section_stack[-1]\n";
898 print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
899 for my $seclabel (@seclabels)
900 { label_add_index_entries($seclabel); }
901 # This should only happen once per file.
902 label_add_index_entries("");
903 if (scalar(@index_deferrers) != 0)
Fred Drake54bad441999-01-14 18:17:07 +0000904 { $he->dump;
905 die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
Fred Drake3fe1d321999-01-08 15:25:29 +0000906 do_deferred_index_entries();
907 return 0;
908 }
909 elsif ($tag eq "hr")
910 { }
911 elsif ($tag eq "ignore")
912 { # Hack for ignored elements
913 return 0;
914 }
915 elsif ($tag eq "li")
916 { if ($startflag)
917 { print TEXI "\n\n\@item\n";
918 do_deferred_index_entries(); } }
919 elsif ($tag eq "ol")
920 { if ($startflag)
921 { print TEXI "\n\@enumerate \@bullet\n"; }
922 else
923 { print TEXI "\n\@end enumerate\n"; } }
924 elsif ($tag eq "p")
925 { if ($startflag)
926 { print TEXI "\n\n"; }
927 if (scalar(@index_deferrers) != 0)
Fred Drake54bad441999-01-14 18:17:07 +0000928 { $he->dump;
929 die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?"; }
Fred Drake3fe1d321999-01-08 15:25:29 +0000930 do_deferred_index_entries(); }
931 elsif ($tag eq "pre")
932 { print_pre($he);
933 return 0; }
934 elsif ($tag eq "table")
935 { # Could also indicate common formatting for first column, or
936 # determine relative widths for columns (or determine a prototype row)
937 if ($startflag)
938 { if (defined $table_columns)
939 { $he->dump;
940 die "Can't deal with table nested inside $table_columns-column table"; }
941 $table_columns = table_columns($he);
942 if ($table_columns < 2)
943 { $he->dump;
944 die "Column with $table_columns columns?"; }
945 elsif ($table_columns == 2)
946 { print TEXI "\n\@table \@asis\n"; }
947 else
948 { print TEXI "\n\@multitable \@columnfractions";
949 for (my $i=0; $i<$table_columns; $i++)
950 { print TEXI " ", 1.0/$table_columns; }
951 print TEXI "\n"; } }
952 else
953 { if ($table_columns == 2)
954 { print TEXI "\n\@end table\n"; }
955 else
956 { print TEXI "\n\@end multitable\n"; }
957 undef $table_columns; } }
958 elsif (($tag eq "td") || ($tag eq "th"))
959 { if ($startflag)
960 { if ($table_first_column)
961 { print TEXI "\n\@item ";
962 $table_first_column = 0; }
963 elsif ($table_columns > 2)
964 { print TEXI "\n\@tab "; } }
965 else
966 { print TEXI "\n"; } }
967 elsif ($tag eq "tr")
968 { if ($startflag)
969 { $table_first_column = 1; } }
970 elsif ($tag eq "ul")
971 { if ($startflag)
972 { print TEXI "\n\@itemize \@bullet\n"; }
973 else
974 { print TEXI "\n\@end itemize\n"; } }
975 else
Fred Drake54bad441999-01-14 18:17:07 +0000976 { # I used to have a newline before "output_body" here.
977 print STDERR "output_body: ignoring <$tag> tag\n";
Fred Drake3fe1d321999-01-08 15:25:29 +0000978 $he->dump;
979 return 0; }
980
981 return 1;
982}
983
# Emit the text of a <PRE> element to the TEXI filehandle as a Texinfo
# "@example" block.
#   $he_pre -- the <PRE> element; it must hold exactly one text string.
# Dies if the element has no content, several content items, or a
# non-string content item.
sub print_pre ( $ )
{ my ($he_pre) = check_args(1, @_);
  if (!has_single_content_string($he_pre))
    { # An empty <PRE> has undefined content.  Guard the dereference so
      # that the diagnostic below is what the user sees, rather than an
      # "undefined value as an ARRAY reference" error under strict refs.
      my $ref_content = $he_pre->content;
      my @content = (defined $ref_content) ? @{$ref_content} : ();
      die "Multiple or non-string content for <PRE>: ", @content; }
  my $pre_content = $ {$he_pre->content}[0];
  # No newline is printed after "@example".
  # NOTE(review): presumably the <PRE> text supplies its own leading
  # newline -- confirm against real latex2html output.
  print TEXI "\n\@example";
  # The "&" call form is kept on purpose: texi_quote is declared (with a
  # prototype) later in the file, and "&" sidesteps the prototype check.
  print TEXI &texi_quote($pre_content);
  print TEXI "\@end example\n";
}
993
# Return the number of columns of a <TABLE> element, i.e. the largest
# number of content items found in any of its rows.
#   $table -- the <TABLE> element; every child must be a <TR> element.
# Returns 0 for a table without content; an empty <TR> counts as 0 cells.
# Dies (after dumping the table and the offending child) if a child is
# not a <TR> element.
sub table_columns ( $ )
{ my ($table) = check_args(1, @_);
  my $ref_rows = $table->content;
  # An element without children has undefined content.
  if (!defined $ref_rows)
    { return 0; }
  my $result = 0;
  for my $row (@{$ref_rows})
    { # A plain text child is not a reference, so "->tag" cannot be called
      # on it; report it with the same diagnostic as a wrong element.
      if (!(ref $row) || ($row->tag ne "tr"))
        { $table->dump;
          if (ref $row)
            { $row->dump; }
          die "Expected <TR> as table row."; }
      my $ref_cells = $row->content;
      my $cell_count = (defined $ref_cells) ? scalar(@{$ref_cells}) : 0;
      $result = max($result, $cell_count); }
  return $result;
}
1005
1006
1007###########################################################################
1008### Utilities
1009###
1010
# Return the numerically smaller of two values; when they compare equal,
# the second argument is returned.
sub min ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first < $second) {
        return $first;
    }
    return $second;
}
1015
# Return the numerically larger of two values; when they compare equal,
# the second argument is returned.
sub max ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first > $second) {
        return $first;
    }
    return $second;
}
1020
# Parse an HTML file into an HTML::TreeBuilder tree and run the cleanup
# passes (cleanup_parse_tree) over it.
#   $file -- name of the HTML file to read.
# Returns the cleaned-up tree.
# Dies if the file cannot be parsed (e.g., it cannot be opened).
sub file_to_tree ( $ )
{ my ($file) = check_args(1, @_);

  my $tree = HTML::TreeBuilder->new;
  $tree->ignore_unknown(1);
  # $tree->warn(1);
  # HTML::Parser's parse_file is documented to return undef (with the
  # reason in $!) when the file cannot be opened.  Ignoring that result
  # would silently produce an empty tree and drop the whole page.
  if (!defined $tree->parse_file($file))
    { die "Can't parse HTML file $file: $!"; }
  cleanup_parse_tree($tree);
  return $tree;
}
1031
1032
# Return 1 if element $he has exactly one content item, and 0 otherwise
# (including when it has no content at all).
# Dies if $he is not a reference (for instance, a text string).
sub has_single_content ( $ )
{
    my ($he) = check_args(1, @_);
    # Returning 0 here instead of dying was considered by the original
    # author, who left that alternative commented out.
    die "Non-reference argument: $he" unless ref $he;

    my $items = $he->content;
    return 0 unless defined $items;
    return (scalar(@{$items}) == 1) ? 1 : 0;
}
1046
1047
# Return true if element $he has exactly one content item, that item is
# itself an element, and its tag equals $tag.  Returns 0 when there is not
# exactly one item, when the item is plain text, or when its tag is
# undefined; otherwise returns the result of the string comparison.
sub has_single_content_with_tag ( $$ )
{
    my ($he, $tag) = check_args(2, @_);
    return 0 unless has_single_content($he);

    my ($only_child) = @{$he->content};
    return 0 unless ref $only_child;        # plain text has no tag

    my $child_tag = $only_child->tag;
    return 0 unless defined $child_tag;
    return $child_tag eq $tag;
}
1062
# Return 1 if element $he has exactly one content item and that item is a
# plain text string (not a reference); return 0 otherwise.
sub has_single_content_string ( $ )
{
    my ($he) = check_args(1, @_);
    return 0 unless has_single_content($he);

    my ($only_child) = @{$he->content};
    return (ref $only_child) ? 0 : 1;
}
1072
1073
# Describe an <A> element.  Returns a flat list: the NAME attribute, the
# HREF attribute, then every content item of the anchor.  Either attribute
# may be undefined, and there may be no content items at all.
# Dies (after dumping the element) if $he is not an <A> element.
# (The original author noted not seeing how to detect further attributes.)
sub anchor_info ( $ )
{
    my ($he) = check_args(1, @_);
    unless ($he->tag eq "a") {
        $he->dump;
        die "passed non-anchor to anchor_info";
    }

    my $name = $he->attr('name');
    my $href = $he->attr('href');
    my $children = $he->content;
    my @content = (defined $children) ? @{$children} : ();
    return ($name, $href, @content);
}
1089
1090
# Quote text for Texinfo: prefix each "@", "{" and "}" with "@", and turn
# every " -- " into " --- ".  Returns the quoted copy.
sub texi_quote ( $ )
{
    my ($quoted) = check_args(1, @_);
    for ($quoted) {
        s/([\@\{\}])/\@$1/g;
        s/ -- / --- /g;
    }
    return $quoted;
}
1097
# Clean up a section title so that it does not confuse Makeinfo or Info.
# In order: strip leading spaces, strip trailing spaces and colons, strip
# a leading section number (digits and dots followed by spaces), then
# delete every comma and every remaining colon.  Returns the cleaned copy.
sub texi_remove_punctuation ( $ )
{
    my ($title) = check_args(1, @_);

    for ($title) {
        s/^ +//g;
        s/[ :]+$//g;
        s/^[1-9][0-9.]* +//g;
        s/,//g;
        # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
        # gets converted into " - ", just as "---" would be converted into
        # " -- ", so the names end up differing.)  Hence colons are deleted
        # rather than replaced:
        # s/:/ -- /g;
        s/://g;
    }
    return $title;
}
1113
1114
## Do not use this inside `traverse': it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Remove element $he from the content list of $parent (default: the
# element's own parent) and clear the element's parent link.
# Returns 1 on success; dies if $he is not among the parent's content.
sub html_remove ( $;$ )
{
    my ($he, $parent) = check_args_range(1, 2, @_);
    $parent = $he->parent unless defined $parent;

    my $siblings = $parent->content;
    my @snapshot = @{$siblings};
    for my $pos (0 .. $#snapshot) {
        next if $snapshot[$pos] ne $he;
        splice @{$siblings}, $pos, 1;
        $he->parent(undef);
        return 1;
    }
    die "Didn't find $he in $parent";
}
1131
1132
# Replace element $orig by $new in the content list of $parent (default:
# $orig's own parent).  $new gets $parent as its parent and $orig's parent
# link is cleared.
# Returns 1 on success; dies if $orig is not among the parent's content.
sub html_replace ( $$;$ )
{
    my ($orig, $new, $parent) = check_args_range(2, 3, @_);
    $parent = $orig->parent unless defined $parent;

    my $slots = $parent->content;
    my @snapshot = @{$slots};
    for my $pos (0 .. $#snapshot) {
        next if $snapshot[$pos] ne $orig;
        $slots->[$pos] = $new;
        $new->parent($parent);
        $orig->parent(undef);
        return 1;
    }
    die "Didn't find $orig in $parent";
}
1147
# Replace element $orig, within $parent (default: $orig's own parent), by
# a freshly created <META> element.  Returns whatever html_replace
# returns.
sub html_replace_by_meta ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("meta");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
1155
# Replace element $orig, within $parent (default: $orig's own parent), by
# a freshly created element with the made-up tag "ignore".  Returns
# whatever html_replace returns.
sub html_replace_by_ignore ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("ignore");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
1163
1164
1165
1166###
1167### Collect text elements
1168###
1169
# State shared between collect_texts and its traversal callback
# collect_if_text: the strings gathered so far, the element at which to
# stop (may be undefined), and whether that element has been reached.
my @collected_texts;
my $collect_texts_stoppoint;
my $done_collecting;

# Return the list of text strings found while traversing $root.  If $stop
# is given, nothing is collected once the traversal reaches that element.
sub collect_texts ( $;$ )
{
    my ($root, $stop) = check_args_range(1, 2, @_);
    # print STDERR "collect_texts: $root $stop\n";
    ($collect_texts_stoppoint, $done_collecting) = ($stop, 0);
    @collected_texts = ();
    $root->traverse(\&collect_if_text);         # process texts
    # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
    return @collected_texts;
}
1184
# Traversal callback for collect_texts.  Only the first argument (the
# node) is used; the other two are ignored.
# A text string is appended to @collected_texts.  Reaching the stop point
# sets $done_collecting, after which every call returns 0 immediately.
# Returns 1 for any other element and 0 in all remaining cases.
sub collect_if_text ( $$$ )
{
    my ($he) = check_args(3, @_);
    return 0 if $done_collecting;
    return 0 unless defined $he;

    unless (ref $he) {
        push @collected_texts, $he;
        return 0;
    }

    my $at_stoppoint = (defined $collect_texts_stoppoint)
        && ($he eq $collect_texts_stoppoint);
    if ($at_stoppoint) {
        $done_collecting = 1;
        return 0;
    }
    return 1;
}
1199
1200
1201###########################################################################
1202### Clean up parse tree
1203###
1204
# Run the tree-cleanup passes over $he, in a fixed order, each as a
# separate traversal that skips text nodes.  Returns $he.
sub cleanup_parse_tree ( $ )
{
    my ($he) = check_args(1, @_);
    my @passes = (\&delete_if_navigation,
                  \&delete_extra_spaces,
                  \&merge_dl,
                  \&reorder_dt_and_dl);
    for my $pass (@passes) {
        $he->traverse($pass, 'ignore text');
    }
    return $he;
}
1213
1214
1215## Simpler version that deletes contents but not the element itself.
1216# sub delete_if_navigation ( $$$ )
1217# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
1218# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
1219# { $he->delete();
1220# return 0; }
1221# else
1222# { return 1; }
1223# }
1224
# Traversal callback: delete every <DIV CLASS="navigation"> element.
# Only acts when $startflag is true (otherwise returns nothing); the
# depth argument is ignored.
# For a navigation <DIV>, the element is spliced out of its parent's
# content list and deleted, and 0 is returned.  Returns 1 for any other
# element.
sub delete_if_navigation ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];   # ignore depth argument
    return unless $startflag;

    my $class = ($he->tag() eq "div") ? $he->attr('class') : undef;
    return 1 unless (defined $class) && ($class eq 'navigation');

    # Work through the content reference itself: a copy of the array
    # would leave the parent unchanged.
    my $siblings = $he->parent()->content();
    my @positions = grep { $siblings->[$_] eq $he } 0 .. $#{$siblings};
    if (@positions) {
        splice(@{$siblings}, $positions[0], 1);
    }
    $he->delete();
    return 0;
}
1243
# Traversal callback: strip insignificant whitespace strings from $he.
# Only acts when $startflag is true (otherwise returns nothing); the
# depth argument is ignored.
# For <HEAD>, <HTML>, <TABLE>, <TR> and <UL> all blank children are
# removed; for every element delete_trailing_spaces is applied.
# Returns 1.
sub delete_extra_spaces ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];   # ignore depth argument
    return unless $startflag;

    my $tag = $he->tag;
    delete_child_spaces($he) if $tag =~ /^(head|html|table|tr|ul)$/;
    delete_trailing_spaces($he);
    return 1;
}
1255
1256
# Remove, in place, every content item of $he that consists only of
# spaces (or is empty).  Element children are kept: the pattern is matched
# against their string form, which is never blank.
# Does nothing for an element without content.  Returns nothing.
sub delete_child_spaces ( $ )
{ my ($he) = check_args(1, @_);
  my $ref_content = $he->content();
  # An element without children (e.g., "<ul></ul>") has undefined content;
  # dereferencing it is fatal under strict refs.  delete_trailing_spaces
  # has the same guard.
  if (! defined $ref_content)
    { return; }
  # Assign through the reference so the element's own list is updated.
  @{$ref_content} = grep { $_ !~ /^ *$/ } @{$ref_content};
  return;
}
1265
# Remove blank strings of $he's content that sit in two places:
#  * directly before a <BR>, <DD>, <DL>, <DT>, <HR>, <P> or <UL> child;
#  * at the very end of a <DD>, <DT>, heading, <LI> or <P> element.
# Does nothing for an element without content.
sub delete_trailing_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $kids = $he->content();
    return unless defined $kids;

    # Could also check for previous element = /^h[1-6]$/.
    my $pos = 0;
    while ($pos < scalar(@{$kids}) - 1) {
        my ($this, $next) = @{$kids}[$pos, $pos + 1];
        if (($this =~ /^ *$/)
            && (ref $next) && ($next->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
            # After the removal the next candidate is at the same index.
            splice(@{$kids}, $pos, 1);
        }
        else {
            $pos++;
        }
    }

    if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/) {
        my $last = $kids->[-1];         # undefined when the list is empty
        if ((defined $last) && ($last =~ /^ *$/)) {
            pop @{$kids};
        }
    }
}
1283
1284
Fred Drake54bad441999-01-14 18:17:07 +00001285# LaTeX2HTML sometimes creates
1286# <DT>text
1287# <DL COMPACT><DD>text
1288# which should actually be:
1289# <DL COMPACT>
1290# <DT>text
1291# <DD>text
1292# Since a <DL> gets added, this ends up looking like
1293# <P>
1294# <DL>
1295# <DT>
1296# text1...
1297# <DL COMPACT>
1298# <DD>
1299# text2...
1300# dt_or_dd1...
1301# dt_or_dd2...
1302# which should become
1303# <P>
1304# <DL COMPACT>
1305# <DT>
1306# text1...
1307# <DD>
1308# text2...
1309# dt_or_dd1...
1310# dt_or_dd2...
1311
# Traversal callback: repair a <DT> that swallowed the <DL> meant to hold
# its <DD> items.  Only acts when $startflag is true (otherwise returns
# nothing); the depth argument is ignored.
#
# The rewrite applies only when all of the following hold:
#   - $he is a <P> whose first child is an implicit <DL>;
#   - that <DL>'s first child is a <DT>;
#   - the <DT>'s last child is a <DL>;
#   - that inner <DL>'s first child is a <DD>.
# In that case the inner <DL> is replaced by an "ignore" element, its
# former children are inserted into the outer <DL> right after the <DT>,
# and 0 is returned so the children are not traversed.
# In every other case nothing is changed and 1 is returned.
sub reorder_dt_and_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];   # ignore depth argument
    return unless $startflag;

    return 1 unless $he->tag() eq "p";
    my $p_kids = $he->content();
    return 1 unless defined $p_kids;
    return 1 unless scalar(@{$p_kids}) >= 1;

    my $outer_dl = $p_kids->[0];
    return 1 unless (ref $outer_dl) && ($outer_dl->tag() eq "dl")
        && $outer_dl->implicit();
    my $outer_kids = $outer_dl->content();
    return 1 unless defined $outer_kids;
    return 1 unless scalar(@{$outer_kids}) >= 1;

    my $dt = $outer_kids->[0];
    return 1 unless (ref $dt) && ($dt->tag() eq "dt");
    my $dt_kids = $dt->content();
    return 1 unless defined $dt_kids;
    return 1 unless scalar(@{$dt_kids}) > 0;

    my $inner_dl = $dt_kids->[-1];
    return 1 unless (ref $inner_dl) && ($inner_dl->tag() eq "dl");
    my $inner_kids = $inner_dl->content();
    return 1 unless defined $inner_kids;

    # Copy the inner list before the inner <DL> is detached.
    my @moved = @{$inner_kids};
    return 1 unless (scalar(@moved) > 0)
        && (ref $moved[0]) && ($moved[0]->tag() eq "dd");

    # print STDERR "CHANGING\n"; $he->dump();
    html_replace_by_ignore($inner_dl);
    splice(@{$outer_kids}, 1, 0, @moved);
    # print STDERR "CHANGED TO:\n"; $he->dump();
    return 0;                           # don't traverse children
}
1355
1356
Fred Drake3fe1d321999-01-08 15:25:29 +00001357# If we find a paragraph that looks like
1358# <P>
1359# <HR>
1360# <UL>
1361# then accumulate its links into a contents_list and delete the paragraph.
# Traversal callback.  Only acts when $startflag is true (otherwise
# returns nothing); the depth argument is ignored.
# When $he is a <P> whose content is exactly an <HR> followed by a <UL>,
# hand it to process_child_links, delete it, and return 0.
# Returns 1 for every other element.
sub process_if_child_links ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];   # ignore depth argument
    return unless $startflag;

    return 1 unless $he->tag() eq "p";
    my $kids = $he->content();
    return 1 unless defined $kids;

    my @content = @{$kids};
    return 1 unless scalar(@content) == 2;
    my ($rule, $list) = @content;
    return 1 unless (ref $rule) && ($rule->tag() eq "hr");
    return 1 unless (ref $list) && ($list->tag() eq "ul");

    process_child_links($he);
    $he->delete();
    return 0;
}
1379
1380
1381# If we find
1382# <H4>
1383# "Footnotes"
1384# <DL>
1385# <DT>
1386# <A NAME="foot560">
1387# "...borrow"
1388# <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
1389# "1.2"
1390# <DD>
1391# "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
1392# ...
1393# then record the footnote information and delete the section and list.
1394
# True between seeing the "Footnotes" heading and seeing the <DL> that is
# expected to follow it.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback that records footnotes and removes them from the
# tree.  Only acts when $startflag is true (otherwise returns nothing);
# the depth argument is ignored.
#
#  * An <H4> whose sole content is the string "Footnotes" is replaced by
#    an "ignore" element; the next element visited must then be a <DL>.
#  * In that <DL>, children are taken in <DT>/<DD> pairs.  Each <DT> must
#    hold exactly two anchors carrying the same NAME; the <DD> is stored
#    in %footnotes under that name and replaced by an "ignore" element.
#    Finally the <DL> itself is replaced by an "ignore" element.
#
# Returns 0 for the heading and for the footnote <DL>, 1 otherwise.
# Dies if the element after the heading is not a <DL>, or if the <DL> does
# not have the shape described above.
sub process_if_footnotes ( $$$ )
{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
  if (!$startflag)
    { return; }

  if (($he->tag() eq "h4")
      && has_single_content_string($he)
      && ($ {$he->content}[0] eq "Footnotes"))
    { html_replace_by_ignore($he);
      $process_if_footnotes_expect_dl_next = 1;
      return 0; }

  if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl"))
    { my $ref_content = $he->content();
      if (defined $ref_content)
        { $process_if_footnotes_expect_dl_next = 0;
          my @content = @{$ref_content};
          for (my $i=0; $i<$#content; $i+=2)
            { my $he_dt = $content[$i];
              my $he_dd = $content[$i+1];
              # Text children are not references; test that first so they
              # get this diagnostic instead of a failed method call.
              if (!(ref $he_dt) || !(ref $he_dd)
                  || ($he_dt->tag ne "dt") || ($he_dd->tag ne "dd"))
                { $he->dump;
                  die "expected <DT> and <DD> at positions $i and ", $i+1; }
              my $ref_dt_content = $he_dt->content();
              my @dt_content = (defined $ref_dt_content) ? @{$ref_dt_content} : ();
              if ((scalar(@dt_content) != 2)
                  || !(ref $dt_content[0]) || !(ref $dt_content[1])
                  || ($dt_content[0]->tag ne "a")
                  || ($dt_content[1]->tag ne "a"))
                { $he_dt->dump;
                  die "Expected 2 anchors as content of <DT>"; }
              # Only the names are needed; anchor_info also returns the
              # href and the anchor's content items.
              my ($dt1_name) = anchor_info($dt_content[0]);
              # The second name must come from the SECOND anchor.  Reading
              # the first anchor twice made the check below always pass.
              my ($dt2_name) = anchor_info($dt_content[1]);
              if ($dt1_name ne $dt2_name)
                { $he_dt->dump;
                  die "Expected identical names for anchors"; }
              html_replace_by_ignore($he_dd);
              $he_dd->tag("div");	# has no effect
              $footnotes{$dt1_name} = $he_dd; }
          html_replace_by_ignore($he);
          return 0; } }

  if ($process_if_footnotes_expect_dl_next)
    { $he->dump;
      die "Expected <DL> for footnotes next"; }

  return 1;
}
1444
1445
1446
1447## Merge two adjacent paragraphs containing <DL> items, such as:
1448# <P>
1449# <DL>
1450# <DT>
1451# ...
1452# <DD>
1453# ...
1454# <P>
1455# <DL>
1456# <DT>
1457# ...
1458# <DD>
1459# ...
1460
# Traversal callback: merge runs of adjacent <P> children of $he that each
# hold nothing but a <DL>.  Only acts when $startflag is true; the depth
# argument is ignored.
# The first such paragraph of a run is kept; each following one is removed
# from $he's content and the children of its <DL> are appended to the
# first paragraph's <DL>.
# Returns 1, or nothing when $startflag is false or $he has no content.
sub merge_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];   # ignore depth argument
    return unless $startflag;

    my $kids = $he->content;
    return unless defined $kids;

    # True for a <P> element whose only content is a <DL>.
    my $is_dl_paragraph = sub {
        my ($node) = @_;
        return (ref $node) && ($node->tag eq "p")
            && has_single_content_with_tag($node, "dl");
    };

    for (my $pos = 0; $pos < scalar(@{$kids}) - 1; $pos++) {
        my $first_p = $kids->[$pos];
        next unless $is_dl_paragraph->($first_p);
        my $target_dl = $first_p->content->[0];

        # $pos stays put here; the list shrinks as followers are absorbed.
        while (($pos < scalar(@{$kids}) - 1)
               && $is_dl_paragraph->($kids->[$pos + 1])) {
            my ($absorbed_p) = splice(@{$kids}, $pos + 1, 1);
            my $absorbed_dl = $absorbed_p->content->[0];
            $target_dl->push_content(@{$absorbed_dl->content});
        }

        # Extra step: the element after the run was just rejected, so it
        # cannot start a new run.
        $pos++;
    }
    return 1;
}
1492
1493
1494
1495###########################################################################
1496### Testing
1497###
1498
# Developer test driver, reached through the "-test" command-line switch.
#   $action -- "view" or "": dump the cleaned-up parse tree of $file;
#              "raw":        dump the parse tree without cleanup;
#              "section":    run process_section_file on $file;
#              "index":      run process_index_file, then print_index_info.
#   $file   -- the HTML file to work on.
# Dies on any other action.
sub test ( $$ )
{
    my ($action, $file) = check_args(2, @_);

    # General testing
    if (($action eq "view") || ($action eq "")) {
        my $tree = file_to_tree($file);
        # Other things that were tried here at some point:
        #   print STDERR $tree->as_HTML;
        #   print STDERR $tree->tag(), "\n";
        #   print STDERR @{$tree->content()}, "\n";
        #   for (@{ $tree->extract_links(qw(a img)) }) {
        #     my ($link, $linkelem) = @$_;
        #     print STDERR "$link ", $linkelem->as_HTML;
        #   }
        #   print STDERR @{$tree->extract_links()}, "\n";
        $tree->dump();
        return;
    }

    # Parse tree exactly as HTML::TreeBuilder produces it
    elsif ($action eq "raw") {
        my $tree = HTML::TreeBuilder->new;
        $tree->ignore_unknown(1);
        # $tree->warn(1);
        $tree->parse_file($file);
        $tree->dump();
        # cleanup_parse_tree($tree);
        # $tree->dump();
        return;
    }

    # Test dealing with a section.
    elsif ($action eq "section") {
        process_section_file($file, 0, "Title");
    }

    # Test dealing with many sections.  Disabled: this branch is never
    # taken.  It is kept as a record of the pages the author ran through
    # the converter (contents.html and genindex.html were left out).
    elsif (0) {
        my $api_dir = "/homes/fish/mernst/tmp/python-doc/html/api";
        my @pages = qw(about abstract api cObjects complexObjects concrete
                       countingRefs debugging dictObjects embedding
                       exceptionHandling exceptions fileObjects floatObjects
                       front fundamental importing includes index
                       initialization intObjects intro listObjects
                       longObjects mapObjects mapping newTypes node24
                       noneObject number numericObjects object objects os
                       otherObjects processControl refcountDetails refcounts
                       sequence sequenceObjects standardExceptions
                       stringObjects threads tupleObjects typeObjects types
                       utilities veryhigh);
        for my $file (map { "$api_dir/$_.html" } @pages) {
            print STDERR "\n", "=" x 75, "\n", "$file:\n";
            process_section_file($file, 0, "Title");
        }
    }

    # Test dealing with index.
    elsif ($action eq "index") {
        process_index_file($file, "\@cindex");
        print_index_info();
    }

    else {
        die "Unrecognized action `$action'";
    }
}
1624
1625
1626###########################################################################
1627### Main loop
1628###
1629
# Convert the HTML documentation tree whose main (contents) page is $file
# into one Texinfo file.
#
# The output name is derived from $file: a trailing ".html" (together with
# a preceding "index" and slash, if present) is dropped, the current
# directory is used if nothing remains, and leading directories are
# stripped; ".texi" is appended.  So "api/index.html" gives "api.texi".
#
# The sub writes the Texinfo header, processes $file as the "Top" section,
# then processes every entry queued on @contents_list, and finally emits
# one node per entry of @index_titles followed by "@contents" and "@bye".
# It sets the globals $html_directory and $current_ref_tdf and writes
# through the global TEXI filehandle, which the output routines share.
#
# Dies if the Texinfo file cannot be opened or cannot be closed cleanly.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # could also use File::Basename
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  if ($info_file eq "")
    { chomp($info_file = `pwd`); }
  $info_file =~ s/^.*\///;	# not the most efficient way to remove dirs

  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  my $texi_file = "$info_file.texi";
  # Three-argument open keeps the file name from being read as a mode,
  # and the result is checked: otherwise every later "print TEXI" would
  # silently go nowhere.  The bareword handle stays because the output
  # routines elsewhere in this file print to TEXI.
  open(TEXI, '>', $texi_file)
    or die "Can't open $texi_file for writing: $!";

  print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # The three segments described below are not generated yet; the
  # descriptions (from the Texinfo manual) record what is missing.

  # 2. Summary Description and Copyright
  #      The "Summary Description and Copyright" segment describes the
  #      document and contains the copyright notice and copying permissions
  #      for the Info file.  The segment must be enclosed between `@ifinfo'
  #      and `@end ifinfo' commands so that the formatters place it only in
  #      the Info file.
  #
  # The summary description and copyright segment does not appear in the
  # printed document.
  #
  #      @ifinfo
  #      This is a short example of a complete Texinfo file.
  #
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end ifinfo


  # 3. Title and Copyright
  #      The "Title and Copyright" segment contains the title and copyright
  #      pages and copying permissions for the printed manual.  The segment
  #      must be enclosed between `@titlepage' and `@end titlepage'
  #      commands.  The title and copyright page appear only in the printed
  #      manual.
  #
  # The titlepage segment does not appear in the Info file.
  #
  #      @titlepage
  #      @sp 10
  #      @comment The title is printed in a large font.
  #      @center @titlefont{Sample Title}
  #
  #      @c The following two commands start the copyright page.
  #      @page
  #      @vskip 0pt plus 1filll
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end titlepage


  # 4. `Top' Node and Master Menu
  #      The "Master Menu" contains a complete menu of all the nodes in the
  #      whole Info file.  It appears only in the Info file, in the `Top'
  #      node.
  #
  # The `Top' node contains the master menu for the Info file.  Since a
  # printed manual uses a table of contents rather than a menu, the master
  # menu appears only in the Info file.
  #
  #      @node Top, First Chapter, , (dir)
  #      @comment  node-name, next,          previous, up
  #
  #      @menu
  #      * First Chapter::    The first chapter is the
  #                           only chapter in this sample.
  #      * Concept Index::    This index has two entries.
  #      @end menu



  # Each entry is a triple whose elements are handed to
  # process_section_file as (third, second, first) argument.
  # Use the parameter rather than $ARGV[0], so that the sub does not
  # depend on how the script happened to be invoked.
  $current_ref_tdf = [ "Top", 0, $file ];
  process_section_file($file, 0, "Top");
  while (scalar(@contents_list))
    { $current_ref_tdf = shift @contents_list;
      process_section_file($ {$current_ref_tdf}[2], $ {$current_ref_tdf}[1], $ {$current_ref_tdf}[0]);
    }

  print TEXI "\n";
  for my $indextitle (@index_titles)
    { print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
      print TEXI "\n"; }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";
  # Buffered write errors (e.g., a full disk) only show up at close time.
  close(TEXI)
    or die "Error closing $texi_file: $!";
}
1726
1727# This needs to be last so global variable initializations are reached.
1728
# Command-line driver.
#   html2texi.pl FILE                  convert the tree rooted at FILE
#   html2texi.pl -test [[ACTION] FILE] run test(); ACTION defaults to "",
#                                      FILE defaults to "index.html"
if (!@ARGV)
{ die "No arguments supplied to html2texi.pl"; }

if ($ARGV[0] eq "-test")
{ my (undef, @test_args) = @ARGV;
  my $count = scalar(@test_args);
  if ($count > 2)
    { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }
  my ($action, $file) = ($count == 2) ? @test_args
                      : ($count == 1) ? ("", $test_args[0])
                      :                 ("", "index.html");
  test($action, $file);
  exit();
}

unless (scalar(@ARGV) == 1)
{ die "Pass one argument, the main/contents page"; }

process_contents_file($ARGV[0]);
Fred Drake54bad441999-01-14 18:17:07 +00001749
1750# end of html2texi.pl