blob: 740a7e401f703aee6b7d4476a2883abfedc7c025 [file] [log] [blame]
Fred Drake3fe1d321999-01-08 15:25:29 +00001#! /usr/bin/env perl -w
2# html2texi.pl -- Convert HTML documentation to Texinfo format
3# Michael Ernst <mernst@cs.washington.edu>
4# Time-stamp: <1998-09-10 12:52:38 mernst>
5
6# This program converts HTML documentation trees into Texinfo format.
7# Given the name of a main (or contents) HTML file, it processes that file,
8# and other files (transitively) referenced by it, into a Texinfo file
9# (whose name is chosen from the file or directory name of the argument).
10# For instance:
11# html2texi.pl api/index.html
12# produces file "api.texi".
13
14# Texinfo format can be easily converted to Info format (for browsing in
15# Emacs or the standalone Info browser), to a printed manual, or to HTML.
16# Thus, html2texi.pl permits conversion of HTML files to Info format, and
17# secondarily enables producing printed versions of Web page hierarchies.
18
19# Unlike HTML, Info format is searchable. Since Info is integrated into
20# Emacs, one can read documentation without starting a separate Web
21# browser. Additionally, Info browsers (including Emacs) contain
22# convenient features missing from Web browsers, such as easy index lookup
23# and mouse-free browsing.
24
25# Limitations:
26# html2texi.pl is currently tuned to latex2html output, but should be
27# extensible to arbitrary HTML documents. It will be most useful for HTML
28# with a hierarchical structure and an index. The HTML tree to be
29# traversed must be on local disk, rather than being accessed via HTTP.
30# This script requires the use of "checkargs.pm". To eliminate that
31# dependence, replace calls to check_args* by @_ (which is always the last
32# argument to those functions).
33# Also see the "to do" section, below.
34# Comments, suggestions, bug fixes, and enhancements are welcome.
35
36###
37### Typical usage for the Python documentation:
38###
39
40# (Actually, most of this is in a Makefile instead.)
41# The resulting Info format Python documentation is currently available at
42# ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz
43
44# Fix up HTML problems, eg <DL COMPACT><DD>
45
46# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html
47# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html
48# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html
49# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html
50# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html
51# html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html
52
53# Edit the generated .texi files:
54# * change @setfilename to prefix "python-"
55# * fix up any sectioning, such as for Abstract
56# * make Texinfo menus
57# * perhaps remove the @detailmenu ... @end detailmenu
58# In Emacs:
59# (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer))
60
61# makeinfo api.texi
62# makeinfo ext.texi
63# makeinfo lib.texi
64# makeinfo mac.texi
65# makeinfo ref.texi
66# makeinfo tut.texi
67
68
69###
70### Structure of the code
71###
72
73# To be written...
74
75
76###
77### Design decisions
78###
79
80# Source and destination languages
81# --------------------------------
82#
83# The goal is Info files; I create Texinfo, so I don't have to worry about
84# the finer details of Info file creation. (I'm not even sure of its exact
85# format.)
86#
87# Why not start from LaTeX rather than HTML?
88# I could hack latex2html itself to produce Texinfo instead, or fix up
89# partparse.py (which already translates LaTeX to Texinfo).
90# Pros:
91# * has high-level information such as index entries, original formatting
92# Cons:
93# * those programs are complicated to read and understand
94# * those programs try to handle arbitrary LaTeX input, track catcodes,
95# and more: I don't want to go to that effort. HTML isn't as powerful
96# as LaTeX, so there are fewer subtleties.
97# * the result wouldn't work for arbitrary HTML documents; it would be
98# nice to eventually extend this program to HTML produced from Docbook,
99# Frame, and more.
100
101# Parsing
102# -------
103#
104# I don't want to view the text as a linear stream; I'd rather parse the
105# whole thing and then do pattern matching over the parsed representation (to
106# find idioms such as indices, lists of child nodes, etc.).
107# * Perl provides HTML::TreeBuilder, which does just what I want.
108# * libwww-perl: http://www.linpro.no/lwp/
109# * TreeBuilder: HTML-Tree-0.51.tar.gz
110# * Python Parsers, Formatters, and Writers don't really provide the right
111# interface (and the version in Grail doesn't correspond to another
112# distributed version, so I'm confused about which to be using). I could
113# write something in Python that creates a parse tree, but why bother?
114
115# Other implementation language issues:
116# * Python lacks variable declarations, reasonable scoping, and static
117# checking tools. I've written some of the latter for myself that make
118# my Perl programming a lot safer than my Python programming will be until
119# I have a similar suite for that language.
120
121
122###########################################################################
123### To do
124###
125
126# Section names:
127# Fix the problem with multiple sections in a single file (eg, Abstract in
128# Front Matter section).
129# Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310
130# Index:
131# Perhaps double-check that every tag mentioned in the index is found
132# in the text.
133# Python: email to python-docs@python.org, to get their feedback.
134# Compare to existing lib/ Info manual
135# Write the hooks into info-look; replace pyliblookup1-1.tar.gz.
136# Postpass to remove extra quotation marks around typography already in
137# a different font (to avoid double delimiters as in "`code'"); or
138# perhaps consider using only font-based markup so that we don't get
139# the extra *bold* and `code' markup in Info.
140
141## Perhaps don't rely on automatic means for adding up, next, prev; I have
142## all that info available to me already, so it's not so much trouble to
143## add it. (Right?) But it is *so* easy to use Emacs instead...
144
145
146###########################################################################
147### Strictures
148###
149
150# man HTML::TreeBuilder
151# man HTML::Parser
152# man HTML::Element
153
154# require HTML::ParserWComment;
155require HTML::Parser;
156require HTML::TreeBuilder;
157require HTML::Element;
158
159use File::Basename;
160use Cwd;
161
162use strict;
163# use Carp;
164
165
166use checkargs;
167
168
169###########################################################################
170### Variables
171###
172
# Node titles of the chapter/section/subsection chain currently being
# processed, outermost first (process_section_file pushes onto the end).
my @section_stack = ();

# Contents entry for the file currently being processed; consulted when
# building error messages and when checking depths.
my $current_ref_tdf;

# Prepended to relative file names before they are parsed.
my $html_directory;

# Footnotes of the current file, looked up by anchor name ("foot<N>").
my %footnotes;

# Texinfo sectioning command for each nesting depth.  The first element
# should not be used.
my @sectionmarker = qw(manual chapter section subsection subsubsection);

# Inline HTML tag => Texinfo command (without the leading "@").
my %inline_markup = (
    b      => "strong",
    code   => "code",
    i      => "emph",
    kbd    => "kbd",
    samp   => "samp",
    strong => "strong",
    tt     => "code",
    var    => "var",
);

# Index commands waiting to be printed; see do_deferred_index_entries.
my @deferred_index_entries = ();

# Titles of the index pages encountered so far.
my @index_titles = ();

# Title of an index page => [indexing command, suffix used for @defindex].
my %index_info = (
    "Index"         => [ '@blindex', "bl" ],
    "Concept Index" => [ '@cindex',  "cp" ],
    "Module Index"  => [ '@mdindex', "md" ],
);
197
198
199###########################################################################
200### Main/contents page
201###
202
203# Process first-level page on its own, or just a contents page? Well, I do
204# want the title, author, etc., and the front matter... For now, just add
205# that by hand at the end.
206
207
208# data structure possibilities:
209# * tree-like (need some kind of stack when processing (or parent pointers))
210# * list of name and depth; remember old and new depths.
211
# The table of contents assembled so far.  Each element is a reference to
# a list of (nodetitle, depth, filename).
my @contents_list = ();

# Bookkeeping for duplicate node titles.  Fixing titles up on the fly is
# problematic: by the time a second section with the same name is
# noticed, the first may already have been processed and be out of reach.
# Building the whole contents list up front (reading every file of
# interest first) would probably be better, and would also help with
# cross-references, since all files would already be read in.
#   %contents_hash   - titles that have been added to @contents_list
#   %contents_fixups - titles that have been added more than once
my %contents_hash   = ();
my %contents_fixups = ();

# Entries gathered from the page being processed; folded into
# @contents_list by merge_contents_lists.
my @current_contents_list = ();
225
# Fold @current_contents_list into @contents_list, then empty
# @current_contents_list.
#
# The two lists are compared position by position:
#   * past the end of @contents_list, the current entry is appended;
#   * if title and file both differ and the current entry is deeper than
#     the existing one, the current entry is inserted in front of it;
#   * otherwise the two entries must agree in title, depth and file.
# A title that is added while already present in %contents_hash is
# recorded in %contents_fixups.
#
# Dies if @current_contents_list is empty or if the lists disagree.
sub merge_contents_lists ( )
{
    check_args(0, @_);

    die "empty current_contents_list" unless @current_contents_list;

    # Remember a title that has just been added to @contents_list.
    my $note_title = sub {
        my ($added) = @_;
        if (defined $contents_hash{$added}) {
            $contents_fixups{$added} = 1;
        }
        else {
            $contents_hash{$added} = 1;
        }
    };

    for my $pos (0 .. $#current_contents_list) {
        my $ref_c_tdf = $current_contents_list[$pos];

        # Ran off the end of the main list: simply append.
        if ($pos >= scalar(@contents_list)) {
            push @contents_list, $ref_c_tdf;
            $note_title->($ref_c_tdf->[0]);
            next;
        }

        my ($title,   $depth,   $file)   = @{ $contents_list[$pos] };
        my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};

        # A new, deeper entry from a different file goes before the
        # existing entry.
        if (   ($title ne $c_title)
            && ($depth < $c_depth)
            && ($file ne $c_file))
        {
            splice @contents_list, $pos, 0, $ref_c_tdf;
            $note_title->($c_title);
            next;
        }

        # Anything else has to match exactly.
        next
            if ($title eq $c_title)
            && ($depth == $c_depth)
            && ($file eq $c_file);

        die "while processing $current_ref_tdf->[2] at depth $current_ref_tdf->[1], mismatch at index $pos:"
            . "\n main: <<<$title>>> $depth $file"
            . "\n curr: <<<$c_title>>> $c_depth $c_file";
    }
    @current_contents_list = ();
}
283
284
285
# Gather the child links found beneath $he and merge them into
# @contents_list.
#
#   1. Traverse $he; increment_current_contents_list fills
#      @current_contents_list with [title, depth, href] entries.
#   2. Renumber the traversal depths so that they become consecutive
#      section depths starting one below the current section.
#   3. Drop the uninteresting "About this document ..." and "Contents"
#      entries (hard-coded).
#   4. For each entry whose title is a key of %index_info, process the
#      index file, emit an @defindex line on TEXI, record the title in
#      @index_titles and remove the entry from the list.
#   5. Call merge_contents_lists.
#
# Dies if @current_contents_list is not empty on entry, or if the depth
# implied by @section_stack disagrees with $current_ref_tdf.
sub process_child_links ( $ )
{
    my ($he) = check_args(1, @_);

    # $he->dump;
    if (scalar(@current_contents_list) != 0) {
        die "current_contents_list nonempty: @current_contents_list";
    }
    $he->traverse(\&increment_current_contents_list, 'ignore text');

    # Normalize the depths; for instance, convert 1,3,5 into 0,1,2
    # (plus the offset of the current section).
    my %depths = ();
    for my $ref_tdf (@current_contents_list) {
        $depths{ $ref_tdf->[1] } = 1;
    }
    # Depths are numbers, so sort them numerically: the default string
    # comparison would order 10 before 9 and scramble the renumbering.
    my @sorted_depths   = sort { $a <=> $b } keys %depths;
    my $current_depth   = scalar(@section_stack) - 1;
    my $current_depth_2 = $current_ref_tdf->[1];
    if ($current_depth != $current_depth_2) {
        die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack);
    }
    for (my $i = 0; $i < scalar(@sorted_depths); $i++) {
        $depths{ $sorted_depths[$i] } = $i + $current_depth + 1;
    }
    for my $ref_tdf (@current_contents_list) {
        $ref_tdf->[1] = $depths{ $ref_tdf->[1] };
    }

    # Eliminate uninteresting sections.  Hard-coded hack for now.
    # The emptiness guard lets an empty list reach the explicit
    # "empty current_contents_list" check in merge_contents_lists instead
    # of failing here on an out-of-range index.
    if (@current_contents_list
        && ($current_contents_list[-1][0] eq "About this document ..."))
    {
        pop @current_contents_list;
    }
    if ((scalar(@current_contents_list) > 1)
        && ($current_contents_list[1][0] eq "Contents"))
    {
        # Remove the second entry ("Contents"), keeping the first.
        splice @current_contents_list, 1, 1;
    }

    # Pull index pages out of the list; they are processed right away.
    for (my $i = 0; $i < scalar(@current_contents_list); $i++) {
        my $ref_tdf = $current_contents_list[$i];
        my $title   = $ref_tdf->[0];
        if (exists $index_info{$title}) {
            my $index_file = $ref_tdf->[2];
            my ($indexing_command, $suffix) = @{ $index_info{$title} };
            process_index_file($index_file, $indexing_command);
            print TEXI "\n\@defindex $suffix\n";
            push @index_titles, $title;
            splice @current_contents_list, $i, 1;
            $i--;    # re-examine the entry that slid into this slot
        }
        elsif ($title =~ /\bIndex$/) {
            print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n";
        }
    }

    merge_contents_lists();

    # print_contents_list();
    # print_index_info();
}
339
340
# Traversal callback used by process_child_links.
#
# For the start of each <LI> element, appends [title, depth, href] to
# @current_contents_list.  The title and href come from the <A> that must
# be the first child of the <LI>; $depth is the traversal depth passed in
# by traverse.
#
# Returns nothing for end tags and 1 for start tags.
# Dies if the first child of an <LI> is not an <A> element.
sub increment_current_contents_list ( $$$ )
{
    my ($he, $startflag, $depth) = check_args(3, @_);
    if (!$startflag) {
        return;
    }

    if ($he->tag eq "li") {
        # The first child may be missing or may be a plain text string;
        # report both through the intended diagnostic rather than
        # failing on the ->tag method call.
        my $first = ${ $he->content || [] }[0];
        if (!ref($first) || ($first->tag ne "a")) {
            die "first element of <LI> should be <A>";
        }
        # Only the HREF is needed from the anchor.
        my (undef, $href) = anchor_info($first);
        my $title = join("", collect_texts($first));
        $title = texi_remove_punctuation($title);
        # The problem with these is that they are formatted differently in
        # @menu and @node!
        $title =~ s/``/\"/g;
        $title =~ s/''/\"/g;
        $title =~ s/ -- / /g;
        push @current_contents_list, [ $title, $depth, $href ];
    }
    return 1;
}
362
# Simple HTML-to-Texinfo conversion, used for section titles.
#
#   $he - either a plain text string, which is returned unchanged, or an
#         element whose tag is a key of %inline_markup.
#
# Returns the Texinfo text: "@<command>{...}" wrapped around the
# recursively converted children.
# Dies (after dumping the element) for any tag not in %inline_markup.
sub html_to_texi ( $ )
{
    my ($he) = check_args(1, @_);
    if (!ref $he) {
        return $he;
    }

    my $tag = $he->tag;
    if (!exists $inline_markup{$tag}) {
        $he->dump;
        die "html_to_texi confused by <$tag>";
    }

    my $result = "\@$inline_markup{$tag}\{";
    # content may be undefined for an element without children; treat
    # that as an empty list instead of dying on the dereference.
    for my $elt (@{ $he->content || [] }) {
        $result .= html_to_texi($elt);
    }
    $result .= "\}";
    return $result;
}
380
381
382
# Debugging aid: write every (title, depth, file) entry of @contents_list
# to STDERR, one entry per line, under a "Contents list:" heading.
sub print_contents_list ()
{
    check_args(0, @_);

    print STDERR "Contents list:\n";
    foreach my $entry (@contents_list) {
        my ($title, $depth, $file) = @{$entry};
        print STDERR "$title $depth $file\n";
    }
}
390
391
392
393###########################################################################
394### Index
395###
396
# Anchor name that marks an index link as broken: entries whose HREF
# fragment equals this are stored in the *_broken tables below.
my $l2h_broken_link_name = "l2h-";

# file => { anchor name => [ index texts ] }
# (A list is needed when a single LaTeX command like \envvar expands to
# multiple \index commands.)
my %file_index_entries = ();

# file => [ index texts ], for entries with the broken anchor name.
my %file_index_entries_broken = ();

# The same two tables restricted to the file currently being processed:
# anchor name => [ index texts ], and a plain list of index texts.
my %this_index_entries;
my @this_index_entries_broken;

# Prefix put in front of nested index entries, and the stack of prefixes
# saved while a nested list is being processed.
my $index_prefix   = "";
my @index_prefixes = ();

# Indexing command for the index file being read; set and cleared by
# process_index_file.
my $this_indexing_command;
413
# Debugging aid: dump %file_index_entries and %file_index_entries_broken
# to STDERR.  Files and anchor names are listed in sorted order; when an
# anchor has several entries, the continuation lines are padded by the
# length of the anchor name.
sub print_index_info ()
{
    check_args(0, @_);

    foreach my $file (sort keys %file_index_entries) {
        my $by_anchor = $file_index_entries{$file};
        print STDERR "file: $file\n";
        foreach my $aname (sort keys %{$by_anchor}) {
            my @entries = @{ $by_anchor->{$aname} };
            if (scalar(@entries) == 1) {
                print STDERR " $aname : $entries[0]\n";
            }
            else {
                my $separator = "\n " . (" " x length($aname));
                print STDERR " $aname : ", join($separator, @entries), "\n";
            }
        }
    }

    foreach my $file (sort keys %file_index_entries_broken) {
        print STDERR "file: $file\n";
        foreach my $entry (@{ $file_index_entries_broken{$file} }) {
            print STDERR " $entry\n";
        }
    }
}
433
434
# Parse the index page $file (its name is appended to $html_directory)
# and hand every <DL COMPACT> in it to the index-processing code.
#
#   $file             - file name of the index page
#   $indexing_command - Texinfo indexing command to use for its entries;
#                       published in $this_indexing_command for the
#                       duration of the traversal
sub process_index_file ( $$ )
{
    my ($file, $indexing_command) = check_args(2, @_);

    my $tree = file_to_tree($html_directory . $file);
    # $tree->dump();

    $this_indexing_command = $indexing_command;
    $tree->traverse(\&process_if_index_dl_compact, 'ignore text');
    undef $this_indexing_command;
}
447
448
# Traversal callback used by process_index_file.
#
# On the start of a <DL> element that has a COMPACT attribute, processes
# it with process_index_dl_compact and returns 0.  Returns 1 for every
# other start tag, and nothing for end tags.
sub process_if_index_dl_compact ( $$$ )
{
    # The third (depth) argument is not needed.
    my ($he, $startflag, undef) = check_args(3, @_);
    return if !$startflag;

    my $is_compact_dl = ($he->tag() eq "dl") && (defined $he->attr('compact'));
    return 1 unless $is_compact_dl;

    process_index_dl_compact($he);
    return 0;
}
460
461
# Process one <DL COMPACT> list from a LaTeX2HTML index.
#
# The elements of such a list:
#  * whitespace-only text: ignored
#  * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
#  * Followed by a <DD> element: the <DT> contains a single string, and
#    the <DD> contains a <DL COMPACT> to be recursively processed (with
#    the <DT> string as a prefix), possibly surrounded by whitespace
#    strings.  Handled by process_index_dt_and_dd.
#  * Not followed by a <DD> element: contains a list of anchors and texts
#    (the texts are only separators).  Optionally contains a
#    <DL COMPACT> to be recursively processed (with the <DT> string as a
#    prefix).  Handled by process_index_lone_dt.
#
# Dies if the list contains non-whitespace text or an element other than
# a <DT> where a <DT> is expected.
sub process_index_dl_compact ( $ )
{
    my ($h) = check_args(1, @_);

    # Discard whitespace-only text children, as described above; an
    # element without any content is treated as an empty list.
    my @content = grep { ref($_) || ($_ !~ /^\s*$/) } @{ $h->content() || [] };

    for (my $i = 0; $i < scalar(@content); $i++) {
        my $this_he = $content[$i];
        if (!ref $this_he) {
            $h->dump;
            die "Expected <DT> tag but saw text: $this_he";
        }
        if ($this_he->tag ne "dt") {
            $this_he->dump;
            die "Expected <DT> tag: " . $this_he->tag;
        }
        if (   ($i < scalar(@content) - 1)
            && ref($content[$i+1])
            && ($content[$i+1]->tag eq "dd"))
        {
            process_index_dt_and_dd($this_he, $content[$i+1]);
            $i++;    # the <DD> has been consumed as well
        }
        else {
            process_index_lone_dt($this_he);
        }
    }
}
487
488
489
# Record the index entries of a <DT> element that is not followed by a
# <DD>.
#
# The <DT> holds one or more anchors separated by ", " strings.  If it
# contains more than one anchor, the texts of all subsequent ones are
# "[Link]".  Example:
#   <DT>
#     <A HREF="embedding.html#l2h-201">
#       "$PATH"
#     ", "
#     <A HREF="embedding.html#l2h-205">
#       "[Link]"
# It optionally contains a <DL COMPACT> as well, which is processed
# recursively with the anchor text as prefix.  Example:
#   <DT>
#     <A HREF="types.html#l2h-616">
#       "attribute"
#     <DL COMPACT>
#       <DT>
#         <A HREF="assignment.html#l2h-3074">
#           "assignment"
#         ", "
#         <A HREF="assignment.html#l2h-3099">
#           "[Link]"
#       <DT>
#         <A HREF="types.html#l2h-">
#           "assignment, class"
#
# Each anchor adds "<indexing command> <text>" to %file_index_entries
# (keyed by file and anchor name), or to %file_index_entries_broken
# (keyed by file) when the anchor name is $l2h_broken_link_name.
# Processing stops after a nested <DL>.
#
# Dies on unexpected content: stray text, an element that is neither <A>
# nor <DL>, an anchor without HREF or without exactly one string as
# content, or anchor texts that disagree.
sub process_index_lone_dt ( $ )
{
    my ($dt) = check_args(1, @_);
    my @dtcontent = @{$dt->content()};
    my $acontent;           # index text of the first anchor, prefix included
    my $acontent_suffix;    # the same text without $index_prefix

    # The loop variable is deliberately not called $a, which would mask
    # the special variable used by sort.
    for my $elt (@dtcontent) {
        if ($elt eq ", ") {
            next;
        }
        if (!ref $elt) {
            $dt->dump;
            die "Unexpected <DT> string element: $elt";
        }

        if ($elt->tag eq "dl") {
            push @index_prefixes, $index_prefix;
            if (!defined $acontent_suffix) {
                die "acontent_suffix not yet defined";
            }
            $index_prefix .= $acontent_suffix . ", ";
            process_index_dl_compact($elt);
            $index_prefix = pop(@index_prefixes);
            return;
        }

        if ($elt->tag ne "a") {
            $dt->dump;
            $elt->dump;
            die "Expected anchor in lone <DT>";
        }

        # Only the HREF and the content of the anchor are used.
        my (undef, $ahref, @acontent) = anchor_info($elt);
        if (scalar(@acontent) != 1) {
            die "Expected just one content of <A> in <DT>: @acontent";
        }
        if (ref $acontent[0]) {
            $acontent[0]->dump;
            die "Expected string content of <A> in <DT>: $acontent[0]";
        }
        if (!defined($acontent)) {
            $acontent        = $index_prefix . $acontent[0];
            $acontent_suffix = $acontent[0];
        }
        elsif (($acontent[0] ne "[Link]")
            && ($acontent ne ($index_prefix . $acontent[0])))
        {
            die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>";
        }

        if (!defined $ahref) {
            $dt->dump;
            die "no HREF in anchor in <DT>";
        }
        my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
        if (!defined $ahref_name) {
            # Reference to entire file
            $ahref_name = "";
        }

        my $entry = "$this_indexing_command $acontent";

        if ($ahref_name eq $l2h_broken_link_name) {
            push @{ $file_index_entries_broken{$ahref_file} }, $entry;
            next;
        }

        # Push through the nested reference directly (copying the inner
        # hash into a local %hash first would update only the copy).
        # Missing levels are created by autovivification.  Several
        # entries may point at the same anchor, hence the list.
        push @{ $file_index_entries{$ahref_file}{$ahref_name} }, $entry;
    }
}
581
# Process a <DT> that is followed by a <DD> in a LaTeX2HTML index.
#
#   $dt - the <DT> element; its content must be a single string, which
#         (with trailing spaces removed) becomes part of the prefix for
#         the nested entries
#   $dd - the <DD> element; apart from whitespace-only strings, its
#         content must be a single <DL>, which is processed recursively
#
# $index_prefix is extended by "<DT string>, " for the duration of the
# recursive call and restored afterwards.
# Dies if either element has unexpected content.
sub process_index_dt_and_dd ( $$ )
{
    my ($dt, $dd) = check_args(2, @_);

    # An element without content is treated as an empty list so that the
    # size checks below produce the diagnostic.
    my @dtcontent = @{ $dt->content() || [] };
    if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0])) {
        $dd->dump;
        $dt->dump;
        die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent";
    }
    my $dtcontent = $dtcontent[0];
    $dtcontent =~ s/ +$//;

    # Whitespace-only strings around the nested list are ignored.
    my @ddcontent = grep { ref($_) || ($_ !~ /^\s*$/) } @{ $dd->content() || [] };
    if (scalar(@ddcontent) != 1) {
        die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n ";
    }
    my $ddcontent = $ddcontent[0];
    # Check for an element before calling ->tag, so that a stray text
    # string is reported through the diagnostic below.
    if (!ref($ddcontent) || ($ddcontent->tag ne "dl")) {
        die "Expected <DL> as content of <DD>, but saw: $ddcontent";
    }

    push @index_prefixes, $index_prefix;
    $index_prefix .= $dtcontent . ", ";
    process_index_dl_compact($ddcontent);
    $index_prefix = pop(@index_prefixes);
}
605
606
607###########################################################################
608### Ordinary sections
609###
610
# Convert one HTML section file to Texinfo.
#
#   $file      - the HTML file; used as is when it starts with "/",
#                otherwise appended to $html_directory
#   $depth     - section depth; @section_stack is reduced to its first
#                $depth entries before $nodetitle is pushed
#   $nodetitle - node title; if it is listed in %contents_fixups (and a
#                parent section exists) it is prefixed with the first
#                word of the parent's title
#
# Child links and footnotes are collected first, the index entries
# recorded for $file are loaded, and then the <BODY> is traversed with
# output_body.
# Dies unless the tree is <HTML> with <HEAD> and <BODY> as its first two
# children.
sub process_section_file ( $$$ )
{
    my ($file, $depth, $nodetitle) = check_args(3, @_);

    my $path;
    if ($file =~ /^\//) {
        $path = $file;
    }
    else {
        $path = $html_directory . $file;
    }
    my $he = file_to_tree($path);

    # print STDERR "process_section_file: $file $depth $nodetitle\n";

    # Keep only the first $depth entries of the stack.
    @section_stack = @section_stack[0..$depth-1];

    # Not a great nodename fixup scheme; need a more global view
    if ((defined $contents_fixups{$nodetitle})
        && (scalar(@section_stack) > 0))
    {
        my $parent_title = $section_stack[-1];
        # hack for Python Standard Library
        $parent_title =~ s/^(Built-in|Standard) Module //g;
        my $parent_first_word = (split(/ /, $parent_title))[0];
        $nodetitle = "$parent_first_word $nodetitle";
    }

    push @section_stack, $nodetitle;
    # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

    $he->traverse(\&process_if_child_links, 'ignore text');
    %footnotes = ();
    # $he->dump;
    $he->traverse(\&process_if_footnotes, 'ignore text');

    # Load the index entries recorded for this file, if any.
    %this_index_entries =
        (exists $file_index_entries{$file})
        ? %{ $file_index_entries{$file} }
        : ();
    @this_index_entries_broken =
        (exists $file_index_entries_broken{$file})
        ? @{ $file_index_entries_broken{$file} }
        : ();

    if ($he->tag() ne "html") {
        die "Expected <HTML> at top level";
    }
    my ($head, $body) = @{$he->content()};
    unless ((ref $head) and ($head->tag eq "head")) {
        $he->dump;
        die "<HEAD> not first element of <HTML>";
    }
    unless ((ref $body) and ($body->tag eq "body")) {
        $he->dump;
        die "<BODY> not second element of <HTML>";
    }

    $body->traverse(\&output_body);
}
668
# Stack of the tags we are currently inside that prevent index entries
# from being emitted right now.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @index_deferrers = ();

# Track entering and leaving an element that defers indexing.
#
#   $tag       - tag name of the element
#   $startflag - true when the element starts, false when it ends
#
# On a start, $tag is pushed on @index_deferrers.  On an end, the top of
# the stack is popped and must equal $tag (otherwise we die); afterwards
# do_deferred_index_entries is given a chance to flush.
sub push_or_pop_index_deferrers ( $$ )
{
    my ($tag, $startflag) = check_args(2, @_);

    if (!$startflag) {
        my $popped = pop @index_deferrers;
        if ($tag ne $popped) {
            die "Expected $tag at top of index_deferrers but saw $popped; remainder = ", join(" ", @index_deferrers);
        }
        do_deferred_index_entries();
    }
    else {
        push @index_deferrers, $tag;
    }
}
683
684
# Queue the index entries that belong to the anchor named $label.
#
#   $label - anchor name
#   $he    - (optional) the anchor element; only needed for guessing
#
# If %this_index_entries has entries for $label, they are appended to
# @deferred_index_entries.  Otherwise, if $label is the broken link name
# and the anchor element is available, the texts in and before the anchor
# are compared with the broken index entries of this file, and every
# entry that contains one of the texts is queued as a guess.
sub label_add_index_entries ( $;$ )
{
    my ($label, $he) = check_args_range(1, 2, @_);

    if (exists $this_index_entries{$label}) {
        push @deferred_index_entries, @{ $this_index_entries{$label} };
        return;
    }

    if ($label ne $l2h_broken_link_name) {
        return;
    }
    # Guessing inspects the anchor element and its parent.  Some callers
    # pass only a label; without an element there is nothing to guess
    # from, and calling ->parent on it would die.
    if (!defined $he) {
        return;
    }

    # Try to find some text to use in guessing which links should point
    # here.  Probably only the previous element should be looked at (or,
    # if that is all punctuation, the one before it); collecting all the
    # previous texts is a bit of overkill.
    my @anchor_texts   = collect_texts($he);
    my @previous_texts = collect_texts($he->parent, $he);
    # 4 elements is arbitrary; ought to filter out punctuation and small
    # words first, then perhaps keep fewer.  Perhaps also do further
    # chunking into words, in case the index term isn't a chunk of its
    # own (eg, was in <tt>...</tt>).
    my @candidate_texts = (@anchor_texts, (reverse(@previous_texts))[0..min(3,$#previous_texts)]);

    for my $text (@candidate_texts) {
        # Skip texts that are only punctuation/spaces or very short.
        if ($text =~ /^[\"\`\'().?! ]*$/) {
            next;
        }
        if (length($text) <= 2) {
            next;
        }
        # hack for Python manual; maybe defer until failure first time around?
        $text =~ s/^sys\.//g;
        for my $iterm (@this_index_entries_broken) {
            # I could test for zero: LaTeX2HTML's failures in the Python
            # documentation are only for items of the form "... (built-in...)"
            if (index($iterm, $text) != -1) {
                push @deferred_index_entries, $iterm;
            }
        }
    }
}
729
730
# Print the queued index entries to TEXI, one per line, and empty the
# queue -- but only if there is something queued and @index_deferrers is
# empty.
#
# Need to add calls to this at various places.
# Perhaps add HTML::Element argument and do the check for appropriateness
# here (ie, no action if inside <H1>, etc.).
sub do_deferred_index_entries ()
{
    check_args(0, @_);

    my $have_entries = (scalar(@deferred_index_entries) > 0);
    my $may_emit     = (scalar(@index_deferrers) == 0);
    if ($have_entries && $may_emit) {
        print TEXI "\n", join("\n", @deferred_index_entries), "\n";
        @deferred_index_entries = ();
    }
}
741
# State for table output:
#   $table_columns      - undefined if not in a table
#   $table_first_column - boolean
my ($table_columns, $table_first_column);
744
745sub output_body ( $$$ )
746{ my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
747
748 if (!ref $he)
749 { my $space_index = index($he, " ");
750 if ($space_index != -1)
751 { # Why does
752 # print TEXI texi_quote(substr($he, 0, $space_index+1));
753 # give: Can't locate object method "TEXI" via package "texi_quote"
754 # (Because the definition texi_quote hasn't been seen yet.)
755 print TEXI &texi_quote(substr($he, 0, $space_index+1));
756 do_deferred_index_entries();
757 print TEXI &texi_quote(substr($he, $space_index+1)); }
758 else
759 { print TEXI &texi_quote($he); }
760 return; }
761
762 my $tag = $he->tag();
763
764 # Ordinary text markup first
765 if (exists $inline_markup{$tag})
766 { if ($startflag)
767 { print TEXI "\@$inline_markup{$tag}\{"; }
768 else
769 { print TEXI "\}"; } }
770 elsif ($tag eq "a")
771 { my ($name, $href, @content) = anchor_info($he);
772 if (!$href)
773 { # This anchor is only here for indexing/cross referencing purposes.
774 if ($startflag)
775 { label_add_index_entries($name, $he); }
776 }
777 elsif ($href =~ "^(ftp|http|news):")
778 { if ($startflag)
779 { # Should avoid second argument if it's identical to the URL.
780 print TEXI "\@uref\{$href, "; }
781 else
782 { print TEXI "\}"; }
783 }
784 elsif ($href =~ /^\#(foot[0-9]+)$/)
785 { # Footnote
786 if ($startflag)
787 { # Could double-check name and content, but I'm not
788 # currently storing that information.
789 print TEXI "\@footnote\{";
790 $footnotes{$1}->traverse(\&output_body);
791 print TEXI "\}";
792 return 0; } }
793 else
794 { if ($startflag)
795 { $he->dump;
796 warn "Can't deal with internal HREF anchors yet"; }
797 }
798 }
Fred Drake8e2c9451999-01-11 22:30:34 +0000799 elsif ($tag eq "address")
800 { # this is part of the page footer, ignore
801 return 0;
802 }
Fred Drake3fe1d321999-01-08 15:25:29 +0000803 elsif ($tag eq "br")
804 { print TEXI "\@\n"; }
805 elsif ($tag eq "body")
806 { }
807 elsif ($tag eq "center")
808 { if (has_single_content_string($he)
809 && ($ {$he->content}[0] =~ /^ *$/))
810 { return 0; }
811 if ($startflag)
812 { print TEXI "\n\@center\n"; }
813 else
814 { print TEXI "\n\@end center\n"; }
815 }
816 elsif ($tag eq "div")
817 { my $align = $he->attr('align');
818 if (defined($align) && ($align eq "center"))
819 { if (has_single_content_string($he)
820 && ($ {$he->content}[0] =~ /^ *$/))
821 { return 0; }
822 if ($startflag)
823 { print TEXI "\n\@center\n"; }
824 else
825 { print TEXI "\n\@end center\n"; } }
826 }
827 elsif ($tag eq "dl")
828 { # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
829 if (has_single_content_with_tag($he, "dd"))
830 { my $he_dd = $ {$he->content}[0];
831 if (has_single_content_with_tag($he_dd, "pre"))
832 { my $he_pre = $ {$he_dd->content}[0];
833 print_pre($he_pre);
834 return 0; } }
835 if ($startflag)
836 { # Could examine the elements, to be cleverer about formatting.
837 # (Also to use ftable, vtable...)
838 print TEXI "\n\@table \@asis\n"; }
839 else
840 { print TEXI "\n\@end table\n"; }
841 }
842 elsif ($tag eq "dt")
843 { push_or_pop_index_deferrers($tag, $startflag);
844 if ($startflag)
845 { print TEXI "\n\@item "; }
846 else
847 { } }
848 elsif ($tag eq "dd")
849 { if ($startflag)
850 { print TEXI "\n"; }
851 else
852 { }
853 if (scalar(@index_deferrers) != 0)
854 { $he->dump;
855 die "index deferrers: ", join(" ", @index_deferrers); }
856 do_deferred_index_entries();
857 }
858 elsif ($tag =~ /^(font|big|small)$/)
859 { # Do nothing for now.
860 }
861 elsif ($tag =~ /^h[1-6]$/)
862 { # We don't need this because we never recursively enter the heading content.
863 # push_or_pop_index_deferrers($tag, $startflag);
864 my $secname = "";
865 my @seclabels = ();
866 for my $elt (@{$he->content})
867 { if (!ref $elt)
868 { $secname .= $elt; }
869 elsif ($elt->tag eq "br")
870 { }
871 elsif ($elt->tag eq "a")
872 { my ($name, $href, @acontent) = anchor_info($elt);
873 if ($href)
874 { $he->dump;
875 $elt->dump;
876 die "Nonsimple anchor in <$tag>"; }
877 if (!defined $name)
878 { die "No NAME for anchor in $tag"; }
879 push @seclabels, $name;
880 for my $subelt (@acontent)
881 { $secname .= html_to_texi($subelt); } }
882 else
883 { $secname .= html_to_texi($elt); } }
884 if ($secname eq "")
885 { die "No section name in <$tag>"; }
886 if (scalar(@section_stack) == 1)
887 { if ($section_stack[-1] ne "Top")
888 { die "Not top? $section_stack[-1]"; }
889 print TEXI "\@settitle $secname\n";
890 print TEXI "\@c %**end of header\n";
891 print TEXI "\n";
892 print TEXI "\@node Top\n";
893 print TEXI "\n"; }
894 else
895 { print TEXI "\n\@node $section_stack[-1]\n";
896 print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n"; }
897 for my $seclabel (@seclabels)
898 { label_add_index_entries($seclabel); }
899 # This should only happen once per file.
900 label_add_index_entries("");
901 if (scalar(@index_deferrers) != 0)
902 { die "index deferrers: ", join(" ", @index_deferrers); }
903 do_deferred_index_entries();
904 return 0;
905 }
906 elsif ($tag eq "hr")
907 { }
908 elsif ($tag eq "ignore")
909 { # Hack for ignored elements
910 return 0;
911 }
912 elsif ($tag eq "li")
913 { if ($startflag)
914 { print TEXI "\n\n\@item\n";
915 do_deferred_index_entries(); } }
916 elsif ($tag eq "ol")
917 { if ($startflag)
918 { print TEXI "\n\@enumerate \@bullet\n"; }
919 else
920 { print TEXI "\n\@end enumerate\n"; } }
921 elsif ($tag eq "p")
922 { if ($startflag)
923 { print TEXI "\n\n"; }
924 if (scalar(@index_deferrers) != 0)
925 { die "index deferrers: ", join(" ", @index_deferrers); }
926 do_deferred_index_entries(); }
927 elsif ($tag eq "pre")
928 { print_pre($he);
929 return 0; }
930 elsif ($tag eq "table")
931 { # Could also indicate common formatting for first column, or
932 # determine relative widths for columns (or determine a prototype row)
933 if ($startflag)
934 { if (defined $table_columns)
935 { $he->dump;
936 die "Can't deal with table nested inside $table_columns-column table"; }
937 $table_columns = table_columns($he);
938 if ($table_columns < 2)
939 { $he->dump;
940 die "Column with $table_columns columns?"; }
941 elsif ($table_columns == 2)
942 { print TEXI "\n\@table \@asis\n"; }
943 else
944 { print TEXI "\n\@multitable \@columnfractions";
945 for (my $i=0; $i<$table_columns; $i++)
946 { print TEXI " ", 1.0/$table_columns; }
947 print TEXI "\n"; } }
948 else
949 { if ($table_columns == 2)
950 { print TEXI "\n\@end table\n"; }
951 else
952 { print TEXI "\n\@end multitable\n"; }
953 undef $table_columns; } }
954 elsif (($tag eq "td") || ($tag eq "th"))
955 { if ($startflag)
956 { if ($table_first_column)
957 { print TEXI "\n\@item ";
958 $table_first_column = 0; }
959 elsif ($table_columns > 2)
960 { print TEXI "\n\@tab "; } }
961 else
962 { print TEXI "\n"; } }
963 elsif ($tag eq "tr")
964 { if ($startflag)
965 { $table_first_column = 1; } }
966 elsif ($tag eq "ul")
967 { if ($startflag)
968 { print TEXI "\n\@itemize \@bullet\n"; }
969 else
970 { print TEXI "\n\@end itemize\n"; } }
971 else
972 { print STDERR "\nBailing out\n";
973 $he->dump;
974 return 0; }
975
976 return 1;
977}
978
# Write the content of a <PRE> element to TEXI as an @example block.
# The element must contain exactly one plain string; anything else is fatal.
sub print_pre ( $ )
{
    my ($pre_elt) = check_args(1, @_);
    unless (has_single_content_string($pre_elt)) {
        die "Multiple or non-string content for <PRE>: ", @{$pre_elt->content};
    }
    my ($raw_text) = @{$pre_elt->content};
    # No newline is inserted after "@example" or before "@end example":
    # the <PRE> text is emitted between them as-is (after Texinfo quoting).
    print TEXI "\n\@example" . texi_quote($raw_text) . "\@end example\n";
}
988
# Return the column count of a <TABLE> element: the largest number of
# content items found in any of its rows (0 if there are no rows).
# Every child of the table must be a <TR>; otherwise both elements are
# dumped and the program dies.
sub table_columns ( $ )
{
    my ($table) = check_args(1, @_);
    my $widest = 0;
    foreach my $row (@{$table->content}) {
        unless ($row->tag eq "tr") {
            $table->dump;
            $row->dump;
            die "Expected <TR> as table row.";
        }
        my $cell_count = scalar(@{$row->content});
        $widest = max($widest, $cell_count);
    }
    return $widest;
}
1000
1001
1002###########################################################################
1003### Utilities
1004###
1005
# Return the numerically smaller of the two arguments
# (the second one when they compare equal).
sub min ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first < $second) {
        return $first;
    }
    return $second;
}
1010
# Return the numerically larger of the two arguments
# (the second one when they compare equal).
sub max ( $$ )
{
    my ($first, $second) = check_args(2, @_);
    if ($first > $second) {
        return $first;
    }
    return $second;
}
1015
# Parse the HTML file $file with HTML::TreeBuilder (unknown tags ignored),
# run the parse-tree cleanup passes over the result, and return the tree.
sub file_to_tree ( $ )
{
    my ($html_file) = check_args(1, @_);

    my $parser = HTML::TreeBuilder->new;
    $parser->ignore_unknown(1);
    # $parser->warn(1);        # uncomment for parser diagnostics
    $parser->parse_file($html_file);

    cleanup_parse_tree($parser);
    return $parser;
}
1026
1027
# Return 1 if the element has exactly one content item (string or element),
# and 0 otherwise, including when the element has no content at all.
# A non-reference argument is fatal (returning 0 would be the lenient
# alternative).
sub has_single_content ( $ )
{
    my ($elt) = check_args(1, @_);
    die "Non-reference argument: $elt" unless ref $elt;

    my $kids = $elt->content;
    return 0 unless defined $kids;
    return (scalar(@{$kids}) == 1) ? 1 : 0;
}
1041
1042
# Return true if the element has exactly one content item, that item is
# itself an element (not a plain string), and its tag equals $tag.
sub has_single_content_with_tag ( $$ )
{
    my ($he, $tag) = check_args(2, @_);
    return 0 unless has_single_content($he);

    my ($only_child) = @{$he->content};
    return 0 unless ref $only_child;        # a bare string has no tag

    my $child_tag = $only_child->tag;
    return 0 unless defined $child_tag;
    return $child_tag eq $tag;
}
1057
# Return 1 if the element's content is exactly one plain string
# (i.e. a single non-reference item), and 0 otherwise.
sub has_single_content_string ( $ )
{
    my ($he) = check_args(1, @_);
    return 0 unless has_single_content($he);

    my ($only_child) = @{$he->content};
    return (ref $only_child) ? 0 : 1;
}
1067
1068
# Return (name, href, content...) for an <A> element.  The NAME and HREF
# attribute values may each be undefined; the remaining return values are
# the anchor's content items (none if it has no content).
# Passing any other kind of element dumps it and dies.
# I don't see how to determine if there are more attributes.
sub anchor_info ( $ )
{
    my ($anchor) = check_args(1, @_);
    unless ($anchor->tag eq "a") {
        $anchor->dump;
        die "passed non-anchor to anchor_info";
    }

    my $name = $anchor->attr('name');
    my $href = $anchor->attr('href');
    my $kids = $anchor->content;
    my @children = (defined $kids) ? @{$kids} : ();

    return ($name, $href, @children);
}
1084
1085
# Return a copy of $text made safe for Texinfo: "@", "{" and "}" are
# prefixed with "@", and " -- " is widened to " --- ".
sub texi_quote ( $ )
{
    my ($quoted) = check_args(1, @_);
    for ($quoted) {             # alias $_ to the copy being edited
        s/([\@\{\}])/\@$1/g;
        s/ -- / --- /g;
    }
    return $quoted;
}
1092
# Clean up a section title so that it does not confuse Makeinfo or Info.
# Strips leading spaces, trailing spaces/colons, a leading section number
# such as "1.2 ", and every comma and colon.  Returns the cleaned copy.
sub texi_remove_punctuation ( $ )
{
    my ($title) = check_args(1, @_);

    for ($title) {              # alias $_ to the copy being edited
        s/^ +//g;
        s/[ :]+$//g;
        s/^[1-9][0-9.]* +//g;
        s/,//g;
        # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
        # gets converted into " - ", just as "---" would be converted into
        # " -- ", so the names end up differing.)  Hence colons are simply
        # dropped instead of being rewritten with s/:/ -- /g.
        s/://g;
    }
    return $title;
}
1108
1109
## Do not use this inside `traverse': it throws off the traversal.  Use
## html_replace_by_ignore or html_replace_by_meta instead.
# Remove element $he from the content list of $parent (default: $he's own
# parent) and clear $he's parent link.  Returns 1 on success; dies if $he
# is not found among the parent's content.
sub html_remove ( $;$ )
{
    my ($he, $parent) = check_args_range(1, 2, @_);
    $parent = $he->parent unless defined $parent;

    my $siblings = $parent->content;
    my @snapshot = @{$siblings};        # search a copy, edit the real list
    foreach my $pos (0 .. $#snapshot) {
        next unless $snapshot[$pos] eq $he;
        splice @{$siblings}, $pos, 1;
        $he->parent(undef);
        return 1;
    }
    die "Didn't find $he in $parent";
}
1126
1127
# Replace element $orig by $new in the content list of $parent (default:
# $orig's own parent).  $new is re-parented to $parent and $orig's parent
# link is cleared.  Returns 1 on success; dies if $orig is not found among
# the parent's content.
sub html_replace ( $$;$ )
{
    my ($orig, $new, $parent) = check_args_range(2, 3, @_);
    $parent = $orig->parent unless defined $parent;

    my $siblings = $parent->content;
    my @snapshot = @{$siblings};        # search a copy, edit the real list
    foreach my $pos (0 .. $#snapshot) {
        next unless $snapshot[$pos] eq $orig;
        $siblings->[$pos] = $new;
        $new->parent($parent);
        $orig->parent(undef);
        return 1;
    }
    die "Didn't find $orig in $parent";
}
1142
# Replace $orig, within $parent (default: $orig's own parent), by a fresh
# empty <META> element.  Returns the result of html_replace.
sub html_replace_by_meta ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("meta");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
1150
# Replace $orig, within $parent (default: $orig's own parent), by a fresh
# <IGNORE> pseudo-element.  Returns the result of html_replace.
sub html_replace_by_ignore ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("ignore");
    $parent = $orig->parent unless defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
1158
1159
1160
1161###
1162### Collect text elements
1163###
1164
# State shared between collect_texts and its traversal callback
# collect_if_text.
my @collected_texts;            # text strings gathered so far
my $collect_texts_stoppoint;    # element at which collection stops (or undef)
my $done_collecting;            # set once the stop point has been reached

# Return the text strings found while traversing the tree rooted at $root,
# in traversal order.  If $stop is given, collection ends when that
# element is reached.
sub collect_texts ( $;$ )
{
    my ($root, $stop) = check_args_range(1, 2, @_);
    # print STDERR "collect_texts: $root $stop\n";

    # Reset the shared state for this run.
    @collected_texts         = ();
    $done_collecting         = 0;
    $collect_texts_stoppoint = $stop;

    $root->traverse(\&collect_if_text);         # process texts
    # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
    return @collected_texts;
}
1179
# Traversal callback for collect_texts.  Only the first argument (the
# node) is used; the other two are ignored.  Plain strings are appended
# to @collected_texts.  Returns 1 for an element that should be explored
# further and 0 in every other case (already done, undefined node, text
# node, or the stop point itself).
sub collect_if_text ( $$$ )
{
    my ($node) = check_args(3, @_);

    return 0 if $done_collecting;
    return 0 unless defined $node;

    unless (ref $node) {
        push @collected_texts, $node;
        return 0;
    }

    if (defined($collect_texts_stoppoint) && ($node eq $collect_texts_stoppoint)) {
        $done_collecting = 1;
        return 0;
    }
    return 1;
}
1194
1195
1196###########################################################################
1197### Clean up parse tree
1198###
1199
# Run the cleanup passes over the parse tree rooted at $he, one complete
# traversal per pass and in this order: remove navigation <DIV>s, remove
# superfluous whitespace strings, merge adjacent <P><DL> paragraphs.
# Returns the same tree.
sub cleanup_parse_tree ( $ )
{
    my ($tree) = check_args(1, @_);
    my @passes = (\&delete_if_navigation, \&delete_extra_spaces, \&merge_dl);
    foreach my $pass (@passes) {
        $tree->traverse($pass, 'ignore text');
    }
    return $tree;
}
1207
1208
1209## Simpler version that deletes contents but not the element itself.
1210# sub delete_if_navigation ( $$$ )
1211# { my $he = (check_args(3, @_))[0]; # ignore startflag and depth
1212# if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation'))
1213# { $he->delete();
1214# return 0; }
1215# else
1216# { return 1; }
1217# }
1218
# Traversal callback: on the start visit of a <DIV CLASS="navigation">,
# unlink the element from its parent's content list, delete it, and
# return 0.  Returns 1 for any other element on its start visit and
# nothing on the end visit.  The depth argument is ignored.
sub delete_if_navigation ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    return unless $startflag;

    return 1 unless $he->tag() eq "div";
    my $class = $he->attr('class');
    return 1 unless defined($class) && ($class eq 'navigation');

    # Work on the parent's real content array; a copy of it would leave
    # the tree itself unchanged.
    my $siblings = $he->parent()->content();
    my $pos = 0;
    while ($pos < scalar(@{$siblings})) {
        if ($siblings->[$pos] eq $he) {
            splice(@{$siblings}, $pos, 1);
            last;
        }
        $pos++;
    }
    $he->delete();
    return 0;
}
1237
# Traversal callback: on the start visit of an element, strip blank
# strings that carry no meaning.  For <HEAD>, <HTML>, <TABLE>, <TR> and
# <UL> all blank children are removed; for every element the blank strings
# selected by delete_trailing_spaces are removed.  Returns 1 on the start
# visit and nothing on the end visit.  The depth argument is ignored.
sub delete_extra_spaces ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    return unless $startflag;

    if ($he->tag =~ /^(head|html|table|tr|ul)$/) {
        delete_child_spaces($he);
    }
    delete_trailing_spaces($he);
    return 1;
}
1249
1250
# Remove from the content list of $he every plain-string child that is
# empty or consists only of spaces.  Child elements are always kept.
# The content array is edited in place, so the tree itself is updated.
# An element without any content (undefined content list) is left alone;
# this mirrors the check made by delete_trailing_spaces and
# has_single_content.  Returns nothing.
sub delete_child_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    return unless defined $ref_content;

    # Single pass instead of repeated splicing; references are tested
    # explicitly rather than relying on their stringified form failing
    # to match the blank pattern.
    @{$ref_content} = grep { ref($_) || $_ !~ /^ *$/ } @{$ref_content};
    return;
}
1259
# Remove blank strings from the content list of $he in two situations:
#  * a blank string immediately followed by a <BR>, <DD>, <DL>, <DT>,
#    <HR>, <P> or <UL> element;
#  * a blank string that is the last item of a <DD>, <DT>, heading, <LI>
#    or <P> element.
# Elements without content are left alone.
sub delete_trailing_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $kids = $he->content();
    return unless defined $kids;

    # Could also check for previous element = /^h[1-6]$/.
    my $pos = 0;
    while ($pos < scalar(@{$kids}) - 1) {
        my $follower = $kids->[$pos + 1];
        if (($kids->[$pos] =~ /^ *$/)
            && (ref $follower)
            && ($follower->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
            # Drop the blank and re-examine the item that slid into its slot.
            splice(@{$kids}, $pos, 1);
        }
        else {
            $pos++;
        }
    }

    if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/) {
        my $tail = $kids->[-1];         # undef when the list is empty
        if ((defined $tail) && ($tail =~ /^ *$/)) {
            pop @{$kids};
        }
    }
}
1277
1278
# Traversal callback.  If the element is a paragraph that looks like
#   <P>
#     <HR>
#     <UL>
# (exactly those two children), hand it to process_child_links, delete
# the paragraph, and return 0.  Any other element yields 1 on its start
# visit; the end visit returns nothing.  The depth argument is ignored.
sub process_if_child_links ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    return unless $startflag;

    return 1 unless $he->tag() eq "p";
    my $kids = $he->content();
    return 1 unless defined $kids;

    my @kids = @{$kids};
    return 1 unless scalar(@kids) == 2;
    my ($first, $second) = @kids;
    return 1 unless (ref $first)  && ($first->tag()  eq "hr");
    return 1 unless (ref $second) && ($second->tag() eq "ul");

    process_child_links($he);
    $he->delete();
    return 0;
}
1301
1302
# If we find
#   <H4>
#     "Footnotes"
#   <DL>
#     <DT>
#       <A NAME="foot560">
#         "...borrow"
#       <A HREF="refcountsInPython.html#tex2html2" NAME="foot560">
#         "1.2"
#     <DD>
#       "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. "
#     ...
# then record the footnote information and delete the section and list.

# Set after the "Footnotes" heading has been seen; the very next element
# visited must then be the <DL> holding the footnotes.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback (element, startflag, depth; depth is ignored).
#  * An <H4> whose only content is the string "Footnotes" is replaced by
#    an <IGNORE> element and the "expect <DL>" flag is set; returns 0.
#  * The <DL> that follows is read as <DT>/<DD> pairs.  Each <DT> must
#    hold exactly two anchors carrying the same NAME; the <DD> is replaced
#    in the tree by an <IGNORE> element and stored in %footnotes under
#    that name.  The <DL> itself is then replaced by <IGNORE>; returns 0.
#  * If the flag is set and anything else shows up, the element is dumped
#    and the program dies.
# Returns 1 for all other elements on the start visit, and nothing on the
# end visit.
sub process_if_footnotes ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    if (!$startflag) {
        return;
    }

    if (($he->tag() eq "h4")
        && has_single_content_string($he)
        && ($he->content->[0] eq "Footnotes")) {
        html_replace_by_ignore($he);
        $process_if_footnotes_expect_dl_next = 1;
        return 0;
    }

    if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl")) {
        my $ref_content = $he->content();
        if (defined $ref_content) {
            $process_if_footnotes_expect_dl_next = 0;
            my @content = @{$ref_content};
            # Walk the list two items at a time: <DT> then its <DD>.
            for (my $i = 0; $i < $#content; $i += 2) {
                my $he_dt = $content[$i];
                my $he_dd = $content[$i+1];
                if (($he_dt->tag ne "dt") || ($he_dd->tag ne "dd")) {
                    $he->dump;
                    die "expected <DT> and <DD> at positions $i and ", $i+1;
                }
                my @dt_content = @{$he_dt->content()};
                if ((scalar(@dt_content) != 2)
                    || ($dt_content[0]->tag ne "a")
                    || ($dt_content[1]->tag ne "a")) {
                    $he_dt->dump;
                    die "Expected 2 anchors as content of <DT>";
                }
                # Only the names are needed.  The second anchor must be read
                # from index 1: reading index 0 twice would compare the first
                # anchor with itself and the check below could never fail.
                my ($dt1_name) = anchor_info($dt_content[0]);
                my ($dt2_name) = anchor_info($dt_content[1]);
                if ($dt1_name ne $dt2_name) {
                    $he_dt->dump;
                    die "Expected identical names for anchors";
                }
                html_replace_by_ignore($he_dd);
                $he_dd->tag("div");     # has no effect
                $footnotes{$dt1_name} = $he_dd;
            }
            html_replace_by_ignore($he);
            return 0;
        }
    }

    if ($process_if_footnotes_expect_dl_next) {
        $he->dump;
        die "Expected <DL> for footnotes next";
    }

    return 1;
}
1366
1367
1368
## Merge adjacent paragraphs that each contain nothing but a <DL>, such as:
#   <P>
#     <DL>
#       <DT>
#         ...
#       <DD>
#         ...
#   <P>
#     <DL>
#       <DT>
#         ...
#       <DD>
#         ...
# The items of every following <P><DL> are appended to the first <DL> of
# the run, and the absorbed paragraphs are removed from the parent.
#
# Traversal callback (element, startflag, depth; depth is ignored).
# Returns 1 on the start visit of an element that has content, and
# nothing otherwise.
sub merge_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    return unless $startflag;

    my $kids = $he->content;
    return unless defined $kids;

    # True for a <P> element whose single child is a <DL>.
    my $is_dl_paragraph = sub {
        my ($node) = @_;
        return (ref $node) && ($node->tag eq "p")
            && has_single_content_with_tag($node, "dl");
    };

    for (my $pos = 0; $pos < scalar(@{$kids}) - 1; $pos++) {
        my $head = $kids->[$pos];
        next unless $is_dl_paragraph->($head);

        my $target_dl = $head->content->[0];
        # The upper bound shrinks as paragraphs are removed; $pos stays put.
        while (($pos < scalar(@{$kids}) - 1)
               && $is_dl_paragraph->($kids->[$pos + 1])) {
            my $absorbed  = splice(@{$kids}, $pos + 1, 1);     # remove it
            my $source_dl = $absorbed->content->[0];
            $target_dl->push_content(@{$source_dl->content});  # move items
        }
        # Extra step: the element after the run was just rejected as a
        # <P><DL>, so it cannot start a new run.
        $pos++;
    }
    return 1;
}
1414
1415
1416
1417###########################################################################
1418### Testing
1419###
1420
# Developer test driver, selected with "-test" on the command line.
# $action chooses what to do with $file:
#   "" or "view"  parse and clean up the file, then dump the tree
#   "raw"         parse the file without cleanup, then dump the tree
#   "section"     run process_section_file on the file
#   "index"       run process_index_file on the file, then print_index_info
# Any other action is fatal.
sub test ( $$ )
{
    my ($action, $file) = check_args(2, @_);

    # General testing
    if (($action eq "view") || ($action eq "")) {
        # Files that were used for this during development:
        # # $file = "/homes/gws/mernst/www/links.html";
        # # $file = "/homes/gws/mernst/www/index.html";
        # # $file = "/homes/fish/mernst/java/gud/doc/manual.html";
        # # $file = "/projects/cecil/cecil/doc/manuals/stdlib-man/stdlib/stdlib.html";
        # # $file = "/homes/fish/mernst/tmp/python-doc/html/index.html";
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
        my $cleaned_tree = file_to_tree($file);

        ## Testing
        # print STDERR $cleaned_tree->as_HTML;
        $cleaned_tree->dump();

        # Other things that can be inspected here:
        # print STDERR $cleaned_tree->tag(), "\n";
        # print STDERR @{$cleaned_tree->content()}, "\n";
        #
        # for (@{ $cleaned_tree->extract_links(qw(a img)) }) {
        #   my ($link, $linkelem) = @$_;
        #   print STDERR "$link ", $linkelem->as_HTML;
        # }
        #
        # print STDERR @{$cleaned_tree->extract_links()}, "\n";

        # my @top_level_elts = @{$cleaned_tree->content()};

        # if scalar(@{$cleaned_tree->content()})
        return;
    }

    elsif ($action eq "raw") {
        my $raw_tree = HTML::TreeBuilder->new;
        $raw_tree->ignore_unknown(1);
        # $raw_tree->warn(1);
        $raw_tree->parse_file($file);

        $raw_tree->dump();

        # cleanup_parse_tree($raw_tree);
        # $raw_tree->dump();
        return;
    }

    # Test dealing with a section.
    elsif ($action eq "section") {
        # my $file;
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/intro.html";
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/includes.html";
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html";
        process_section_file($file, 0, "Title");
    }

    # Test dealing with many sections (disabled)
    elsif (0) {
        my @section_files = (
            "/homes/fish/mernst/tmp/python-doc/html/api/about.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/abstract.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/api.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/cObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/complexObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/concrete.html",
            # "/homes/fish/mernst/tmp/python-doc/html/api/contents.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/countingRefs.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/debugging.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/dictObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/embedding.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/exceptionHandling.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/exceptions.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/fileObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/floatObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/front.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/fundamental.html",
            # "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/importing.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/includes.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/index.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/initialization.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/intObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/intro.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/listObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/longObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/mapObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/mapping.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/newTypes.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/node24.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/noneObject.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/number.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/numericObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/object.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/objects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/os.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/otherObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/processControl.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/refcountDetails.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/refcounts.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/sequence.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/sequenceObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/standardExceptions.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/stringObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/threads.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/tupleObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/typeObjects.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/types.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/utilities.html",
            "/homes/fish/mernst/tmp/python-doc/html/api/veryhigh.html",
        );
        foreach my $section_file (@section_files) {
            print STDERR "\n", "=" x 75, "\n", "$section_file:\n";
            process_section_file($section_file, 0, "Title");
        }
    }

    # Test dealing with index.
    elsif ($action eq "index") {
        # my $file;
        # $file = "/homes/fish/mernst/tmp/python-doc/html/api/genindex.html";

        process_index_file($file, "\@cindex");
        print_index_info();
    }

    else {
        die "Unrecognized action `$action'";
    }
}
1546
1547
1548###########################################################################
1549### Main loop
1550###
1551
# Convert the HTML document whose main (contents) page is $file into a
# Texinfo file.
#
# The output name is derived from $file: a trailing "index.html" (or just
# ".html") is dropped together with any leading directories, and ".texi"
# is appended; when nothing is left, the name of the current directory is
# used.  For instance "api/index.html" produces "api.texi".
#
# Sets the globals $html_directory (directory part of $file) and
# $current_ref_tdf, writes to the global filehandle TEXI, and processes
# $file followed by every entry queued in @contents_list.  Dies if the
# output file cannot be opened or closed.
sub process_contents_file ( $ )
{
    my ($file) = check_args(1, @_);

    # could also use File::Basename
    my $info_file = $file;
    $info_file =~ s/(\/?index)?\.html$//;
    if ($info_file eq "") {
        chomp($info_file = `pwd`);
    }
    $info_file =~ s/^.*\///;    # not the most efficient way to remove dirs

    $html_directory = $file;
    $html_directory =~ s/(\/|^)[^\/]+$/$1/;

    my $texi_file = "$info_file.texi";
    # TEXI stays a global bareword handle because the other subs print to
    # it.  Three-argument open keeps odd characters in the name from being
    # read as a mode, and a failure is fatal instead of silently producing
    # no output.
    open(TEXI, '>', $texi_file)
        or die "Cannot open $texi_file for writing: $!";

    print TEXI "\\input texinfo   \@c -*-texinfo-*-\n";
    print TEXI "\@c %**start of header\n";
    print TEXI "\@setfilename $info_file\n";

    # The following Texinfo segments are not generated yet; the notes
    # describe what each would contain.

    # 2. Summary Description and Copyright
    #      The "Summary Description and Copyright" segment describes the
    #      document and contains the copyright notice and copying permissions
    #      for the Info file.  The segment must be enclosed between `@ifinfo'
    #      and `@end ifinfo' commands so that the formatters place it only in
    #      the Info file.
    #
    # The summary description and copyright segment does not appear in the
    # printed document.
    #
    #      @ifinfo
    #      This is a short example of a complete Texinfo file.
    #
    #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
    #      @end ifinfo


    # 3. Title and Copyright
    #      The "Title and Copyright" segment contains the title and copyright
    #      pages and copying permissions for the printed manual.  The segment
    #      must be enclosed between `@titlepage' and `@end titlepage'
    #      commands.  The title and copyright page appear only in the printed
    #      manual.
    #
    # The titlepage segment does not appear in the Info file.
    #
    #      @titlepage
    #      @sp 10
    #      @comment The title is printed in a large font.
    #      @center @titlefont{Sample Title}
    #
    #      @c The following two commands start the copyright page.
    #      @page
    #      @vskip 0pt plus 1filll
    #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
    #      @end titlepage


    # 4. `Top' Node and Master Menu
    #      The "Master Menu" contains a complete menu of all the nodes in the
    #      whole Info file.  It appears only in the Info file, in the `Top'
    #      node.
    #
    # The `Top' node contains the master menu for the Info file.  Since a
    # printed manual uses a table of contents rather than a menu, the master
    # menu appears only in the Info file.
    #
    #      @node    Top,       First Chapter, ,         (dir)
    #      @comment node-name, next,          previous, up
    #
    #      @menu
    #      * First Chapter::    The first chapter is the
    #                           only chapter in this sample.
    #      * Concept Index::    This index has two entries.
    #      @end menu


    # Start with the contents page itself.  The parameter is used here
    # rather than $ARGV[0], so the sub does not depend on how the script
    # was invoked.
    $current_ref_tdf = [ "Top", 0, $file ];
    process_section_file($file, 0, "Top");

    # Then work through the queue of sections.  Each entry is passed on as
    # (element 2, element 1, element 0), matching the ("Top", 0, $file)
    # triple above.
    while (scalar(@contents_list)) {
        $current_ref_tdf = shift @contents_list;
        my ($entry0, $entry1, $entry2) = @{$current_ref_tdf};
        process_section_file($entry2, $entry1, $entry0);
    }

    # One unnumbered node per index, printing the index whose name is
    # stored as element 1 of that title's %index_info entry.
    print TEXI "\n";
    for my $indextitle (@index_titles) {
        my $index_name = $index_info{$indextitle}[1];
        print TEXI "\@node $indextitle\n";
        print TEXI "\@unnumbered $indextitle\n";
        print TEXI "\@printindex $index_name\n";
        print TEXI "\n";
    }

    print TEXI "\@contents\n";
    print TEXI "\@bye\n";
    # Buffered write errors only surface at close time.
    close(TEXI)
        or die "Error closing $texi_file: $!";
}
1648
# This needs to be last so global variable initializations are reached.
#
# Command line:
#   html2texi.pl FILE                   convert the document rooted at FILE
#   html2texi.pl -test [[ACTION] FILE]  run the developer test driver
#                                       (FILE defaults to "index.html",
#                                       ACTION to "")

die "No arguments supplied to html2texi.pl" if scalar(@ARGV) == 0;

if ($ARGV[0] eq "-test") {
    my @test_args = @ARGV;
    shift @test_args;                   # drop the "-test" switch itself
    my $test_arg_count = scalar(@test_args);

    if ($test_arg_count == 0) {
        test("", "index.html");
    }
    elsif ($test_arg_count == 1) {
        test("", $test_args[0]);
    }
    elsif ($test_arg_count == 2) {
        test($test_args[0], $test_args[1]);
    }
    else {
        die "Too many test arguments passed to html2texi: ", join(" ", @ARGV);
    }
    exit();
}

die "Pass one argument, the main/contents page" unless scalar(@ARGV) == 1;

process_contents_file($ARGV[0]);