Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 1 | #! /usr/bin/env perl |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 2 | # html2texi.pl -- Convert HTML documentation to Texinfo format |
| 3 | # Michael Ernst <mernst@cs.washington.edu> |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 4 | # Time-stamp: <1999-01-12 21:34:27 mernst> |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 5 | |
| 6 | # This program converts HTML documentation trees into Texinfo format. |
| 7 | # Given the name of a main (or contents) HTML file, it processes that file, |
| 8 | # and other files (transitively) referenced by it, into a Texinfo file |
| 9 | # (whose name is chosen from the file or directory name of the argument). |
| 10 | # For instance: |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 11 | # html2texi.pl api/index.html |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 12 | # produces file "api.texi". |
| 13 | |
| 14 | # Texinfo format can be easily converted to Info format (for browsing in |
| 15 | # Emacs or the standalone Info browser), to a printed manual, or to HTML. |
| 16 | # Thus, html2texi.pl permits conversion of HTML files to Info format, and |
| 17 | # secondarily enables producing printed versions of Web page hierarchies. |
| 18 | |
| 19 | # Unlike HTML, Info format is searchable. Since Info is integrated into |
| 20 | # Emacs, one can read documentation without starting a separate Web |
| 21 | # browser. Additionally, Info browsers (including Emacs) contain |
| 22 | # convenient features missing from Web browsers, such as easy index lookup |
| 23 | # and mouse-free browsing. |
| 24 | |
| 25 | # Limitations: |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 26 | # html2texi.pl is currently tuned to latex2html output (and it corrects |
| 27 | # several latex2html bugs), but should be extensible to arbitrary HTML |
| 28 | # documents. It will be most useful for HTML with a hierarchical structure |
| 29 | # and an index, and it recognizes those features as created by latex2html |
| 30 | # (and possibly by some other tools). The HTML tree to be traversed must |
| 31 | # be on local disk, rather than being accessed via HTTP. |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 32 | # This script requires the use of "checkargs.pm". To eliminate that |
| 33 | # dependence, replace calls to check_args* by @_ (which is always the last |
| 34 | # argument to those functions). |
| 35 | # Also see the "to do" section, below. |
| 36 | # Comments, suggestions, bug fixes, and enhancements are welcome. |
| 37 | |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 38 | # Troubleshooting: |
| 39 | # Malformed HTML can cause this program to abort, so |
| 40 | # you should check your HTML files to make sure they are legal. |
| 41 | |
| 42 | |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 43 | ### |
| 44 | ### Typical usage for the Python documentation: |
| 45 | ### |
| 46 | |
| 47 | # (Actually, most of this is in a Makefile instead.) |
| 48 | # The resulting Info format Python documentation is currently available at |
| 49 | # ftp://ftp.cs.washington.edu/homes/mernst/python-info.tar.gz |
| 50 | |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 51 | # Fix up HTML problems, eg <DT><DL COMPACT><DD> should be <DT><DD><DL COMPACT>. |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 52 | |
| 53 | # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/api/index.html |
| 54 | # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ext/index.html |
| 55 | # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/lib/index.html |
| 56 | # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/mac/index.html |
| 57 | # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/ref/index.html |
| 58 | # html2texi.pl /homes/fish/mernst/tmp/python-doc/html/tut/index.html |
| 59 | |
| 60 | # Edit the generated .texi files: |
| 61 | # * change @setfilename to prefix "python-" |
| 62 | # * fix up any sectioning, such as for Abstract |
| 63 | # * make Texinfo menus |
| 64 | # * perhaps remove the @detailmenu ... @end detailmenu |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 65 | # In Emacs, to do all this: |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 66 | # (progn (goto-char (point-min)) (replace-regexp "\\(@setfilename \\)\\([-a-z]*\\)$" "\\1python-\\2.info") (replace-string "@node Front Matter\n@chapter Abstract\n" "@node Abstract\n@section Abstract\n") (progn (mark-whole-buffer) (texinfo-master-menu 'update-all-nodes)) (save-buffer)) |
| 67 | |
| 68 | # makeinfo api.texi |
| 69 | # makeinfo ext.texi |
| 70 | # makeinfo lib.texi |
| 71 | # makeinfo mac.texi |
| 72 | # makeinfo ref.texi |
| 73 | # makeinfo tut.texi |
| 74 | |
| 75 | |
| 76 | ### |
| 77 | ### Structure of the code |
| 78 | ### |
| 79 | |
| 80 | # To be written... |
| 81 | |
| 82 | |
| 83 | ### |
| 84 | ### Design decisions |
| 85 | ### |
| 86 | |
| 87 | # Source and destination languages |
| 88 | # -------------------------------- |
| 89 | # |
| 90 | # The goal is Info files; I create Texinfo, so I don't have to worry about |
| 91 | # the finer details of Info file creation. (I'm not even sure of its exact |
| 92 | # format.) |
| 93 | # |
| 94 | # Why not start from LaTeX rather than HTML? |
| 95 | # I could hack latex2html itself to produce Texinfo instead, or fix up |
| 96 | # partparse.py (which already translates LaTeX to Texinfo). |
| 97 | # Pros: |
| 98 | # * has high-level information such as index entries, original formatting |
| 99 | # Cons: |
| 100 | # * those programs are complicated to read and understand |
| 101 | # * those programs try to handle arbitrary LaTeX input, track catcodes, |
| 102 | # and more: I don't want to go to that effort. HTML isn't as powerful |
| 103 | # as LaTeX, so there are fewer subtleties. |
| 104 | # * the result wouldn't work for arbitrary HTML documents; it would be |
| 105 | # nice to eventually extend this program to HTML produced from Docbook, |
| 106 | # Frame, and more. |
| 107 | |
| 108 | # Parsing |
| 109 | # ------- |
| 110 | # |
| 111 | # I don't want to view the text as a linear stream; I'd rather parse the |
| 112 | # whole thing and then do pattern matching over the parsed representation (to |
| 113 | # find idioms such as indices, lists of child nodes, etc.). |
| 114 | # * Perl provides HTML::TreeBuilder, which does just what I want. |
| 115 | # * libwww-perl: http://www.linpro.no/lwp/ |
| 116 | # * TreeBuilder: HTML-Tree-0.51.tar.gz |
| 117 | # * Python Parsers, Formatters, and Writers don't really provide the right |
| 118 | # interface (and the version in Grail doesn't correspond to another |
| 119 | # distributed version, so I'm confused about which to be using). I could |
| 120 | # write something in Python that creates a parse tree, but why bother? |
| 121 | |
| 122 | # Other implementation language issues: |
| 123 | # * Python lacks variable declarations, reasonable scoping, and static |
| 124 | # checking tools. I've written some of the latter for myself that make |
| 125 | # my Perl programming a lot safer than my Python programming will be until |
| 126 | # I have a similar suite for that language. |
| 127 | |
| 128 | |
| 129 | ########################################################################### |
| 130 | ### To do |
| 131 | ### |
| 132 | |
| 133 | # Section names: |
| 134 | # Fix the problem with multiple sections in a single file (eg, Abstract in |
| 135 | # Front Matter section). |
| 136 | # Deal with cross-references, as in /homes/fish/mernst/tmp/python-doc/html/ref/types.html:310 |
| 137 | # Index: |
| 138 | # Perhaps double-check that every tag mentioned in the index is found |
| 139 | # in the text. |
Fred Drake | 9d84308 | 2003-07-30 02:55:28 +0000 | [diff] [blame] | 140 | # Python: email to docs@python.org, to get their feedback. |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 141 | # Compare to existing lib/ Info manual |
| 142 | # Write the hooks into info-look; replace pyliblookup1-1.tar.gz. |
| 143 | # Postpass to remove extra quotation marks around typography already in |
| 144 | # a different font (to avoid double delimiters as in "`code'"); or |
| 145 | # perhaps consider using only font-based markup so that we don't get |
| 146 | # the extra *bold* and `code' markup in Info. |
| 147 | |
| 148 | ## Perhaps don't rely on automatic means for adding up, next, prev; I have |
| 149 | ## all that info available to me already, so it's not so much trouble to |
| 150 | ## add it. (Right?) But it is *so* easy to use Emacs instead... |
| 151 | |
| 152 | |
| 153 | ########################################################################### |
| 154 | ### Strictures |
| 155 | ### |
| 156 | |
| 157 | # man HTML::TreeBuilder |
| 158 | # man HTML::Parser |
| 159 | # man HTML::Element |
| 160 | |
| 161 | # require HTML::ParserWComment; |
| 162 | require HTML::Parser; |
| 163 | require HTML::TreeBuilder; |
| 164 | require HTML::Element; |
| 165 | |
| 166 | use File::Basename; |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 167 | |
| 168 | use strict; |
| 169 | # use Carp; |
| 170 | |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 171 | use checkargs; |
| 172 | |
| 173 | |
| 174 | ########################################################################### |
| 175 | ### Variables |
| 176 | ### |
| 177 | |
# Node titles of the chapter/section/subsection nesting currently being
# processed, outermost first.  process_section_file cuts this stack back to
# the new section's depth and then pushes the new node title.
my @section_stack;

# Reference to the contents entry for the file currently being processed;
# elements [1] (depth) and [2] (file) are quoted in error messages.
my $current_ref_tdf;

# Prepended to relative HTML file names before they are parsed.
my $html_directory;

# Emptied by process_section_file before it scans a file for footnotes.
my %footnotes;

# Sectioning names, presumably indexed by section depth.
# First element should not be used.
my @sectionmarker = qw(manual chapter section subsection subsubsection);

# HTML inline element name => Texinfo command name (without the leading "@").
my %inline_markup = (
    b      => 'strong',
    code   => 'code',
    i      => 'emph',
    kbd    => 'kbd',
    samp   => 'samp',
    strong => 'strong',
    tt     => 'code',
    var    => 'var',
);

# Index commands collected by label_add_index_entries and written out by
# do_deferred_index_entries once nothing is deferring them.
my @deferred_index_entries;

# process_child_links pushes the page title of every index page it handles.
my @index_titles;

# Index page title => [ Texinfo indexing command, suffix used with @defindex ].
my %index_info = (
    'Index'         => [ '@blindex', 'bl' ],
    'Concept Index' => [ '@cindex',  'cp' ],
    'Module Index'  => [ '@mdindex', 'md' ],
);
| 202 | |
| 203 | |
| 204 | ########################################################################### |
| 205 | ### Main/contents page |
| 206 | ### |
| 207 | |
| 208 | # Process first-level page on its own, or just a contents page? Well, I do |
| 209 | # want the title, author, etc., and the front matter... For now, just add |
| 210 | # that by hand at the end. |
| 211 | |
| 212 | |
| 213 | # data structure possibilities: |
| 214 | # * tree-like (need some kind of stack when processing (or parent pointers)) |
| 215 | # * list of name and depth; remember old and new depths. |
| 216 | |
# Master table of contents.  Every element is a reference to a list of
# (nodetitle, depth, filename).
my @contents_list;

# Bookkeeping for node titles that occur more than once.
#
# The problem with doing fixups on the fly is that some sections may have
# already been processed (and no longer available) by the time we notice
# others with the same name.  It's probably better to fully construct the
# contents list (reading in all files of interest) upfront; that will also
# let me do a better job with cross-references, because again, all files
# will already be read in.
my %contents_hash;      # titles that have been added to @contents_list
my %contents_fixups;    # titles that were added more than once

# Entries gathered from the page currently being scanned; folded into
# @contents_list (and emptied) by merge_contents_lists.
my @current_contents_list;
| 230 | |
# Fold @current_contents_list into @contents_list, then empty
# @current_contents_list.
#
# The two lists are compared position by position:
#   * a position past the end of @contents_list is appended;
#   * an entry whose title and file both differ from the master entry at
#     that position, and which is nested more deeply, is inserted in front
#     of that master entry;
#   * an entry identical to the master entry is left alone;
#   * any other difference is fatal.
# The title of every entry added is recorded in %contents_hash; a title that
# is added a second time is flagged in %contents_fixups instead.
# Dies if @current_contents_list is empty.
sub merge_contents_lists ( )
{
    check_args(0, @_);

    die "empty current_contents_list" unless @current_contents_list;

    # Record a title that has just been added to @contents_list.
    my $note_title = sub {
        my ($added_title) = @_;
        if (defined $contents_hash{$added_title}) {
            $contents_fixups{$added_title} = 1;
        }
        else {
            $contents_hash{$added_title} = 1;
        }
    };

    for my $i (0 .. $#current_contents_list) {
        my $ref_c_tdf = $current_contents_list[$i];

        # Ran off the end of the master list: simply append.
        if ($i >= @contents_list) {
            push @contents_list, $ref_c_tdf;
            $note_title->($ref_c_tdf->[0]);
            next;
        }

        my ($title, $depth, $file)       = @{ $contents_list[$i] };
        my ($c_title, $c_depth, $c_file) = @{$ref_c_tdf};

        my $titles_differ = ($title ne $c_title);
        my $files_differ  = ($file ne $c_file);

        # A new, more deeply nested entry: insert it before the master entry.
        if ($titles_differ && ($depth < $c_depth) && $files_differ) {
            splice @contents_list, $i, 0, $ref_c_tdf;
            $note_title->($c_title);
            next;
        }

        # Identical entries need no work.
        next unless $titles_differ || ($depth != $c_depth) || $files_differ;

        die ("while processing $ {$current_ref_tdf}[2] at depth $ {$current_ref_tdf}[1], mismatch at index $i:",
             "\n main: <<<$title>>> $depth $file",
             "\n curr: <<<$c_title>>> $c_depth $c_file");
    }
    @current_contents_list = ();
}
| 288 | |
| 289 | |
| 290 | |
# Set @current_contents_list to a list of [title, depth, href] entries (one
# per <LI> found while traversing $he), then merge that list into
# @contents_list.
#
# Steps:
#   1. Traverse $he, letting increment_current_contents_list collect entries.
#   2. Renumber the traversal depths so that they are consecutive and start
#      one level below the current section.
#   3. Drop uninteresting entries ("About this document ...", "Contents").
#   4. For every entry whose title is a key of %index_info, process the index
#      file, emit an @defindex line to TEXI, and remove the entry.
#   5. Call merge_contents_lists.
# Dies if @current_contents_list is not empty on entry, or if the depth of
# @section_stack disagrees with the depth recorded in $current_ref_tdf.
# Maybe this function should also produce a map
# from title (or href) to sectionlevel (eg "chapter"?).
sub process_child_links ( $ )
{
    my ($he) = check_args(1, @_);

    # $he->dump();
    if (scalar(@current_contents_list) != 0)
      { die "current_contents_list nonempty: @current_contents_list"; }
    $he->traverse(\&increment_current_contents_list, 'ignore text');

    # Normalize the depths; for instance, convert 1,3,5 into 0,1,2 (plus an
    # offset that places the shallowest child just below the current section).
    my %depths = ();
    for my $ref_tdf (@current_contents_list)
      { $depths{$ref_tdf->[1]} = 1; }
    # The sort must be numeric: the default string sort would rank depth 11
    # before depth 9 and scramble the nesting.
    my @sorted_depths = sort { $a <=> $b } keys %depths;
    my $current_depth = scalar(@section_stack)-1;
    my $current_depth_2 = $current_ref_tdf->[1];
    if ($current_depth != $current_depth_2)
      { die "mismatch in current depths: $current_depth $current_depth_2; ", join(", ", @section_stack); }
    for (my $i=0; $i<scalar(@sorted_depths); $i++)
      { $depths{$sorted_depths[$i]} = $i + $current_depth+1; }
    for my $ref_tdf (@current_contents_list)
      { $ref_tdf->[1] = $depths{$ref_tdf->[1]}; }

    # Eliminate uninteresting sections.  Hard-coded hack for now.
    # (The emptiness guard lets an empty list reach merge_contents_lists,
    # which reports it clearly, instead of dying here on an undef deref.)
    if ((scalar(@current_contents_list) > 0)
        && ($current_contents_list[-1]->[0] eq "About this document ..."))
      { pop @current_contents_list; }
    # If the second entry is "Contents", drop it and keep the first entry.
    if ((scalar(@current_contents_list) > 1)
        && ($current_contents_list[1]->[0] eq "Contents"))
      { splice @current_contents_list, 1, 1; }

    # Index pages are processed now and removed from the contents list.
    for (my $i=0; $i<scalar(@current_contents_list); $i++)
      { my $ref_tdf = $current_contents_list[$i];
        my $title = $ref_tdf->[0];
        if (exists $index_info{$title})
          { my $index_file = $ref_tdf->[2];
            my ($indexing_command, $suffix) = @{$index_info{$title}};
            process_index_file($index_file, $indexing_command);
            print TEXI "\n\@defindex $suffix\n";
            push @index_titles, $title;
            splice @current_contents_list, $i, 1;
            # Re-examine the element that slid into position $i.
            $i--; }
        elsif ($title =~ /\bIndex$/)
          { print STDERR "Warning: \"$title\" might be an index; if so, edit \%index_info.\n"; } }

    merge_contents_lists();

    # print_contents_list();
    # print_index_info();
}
| 344 | |
| 345 | |
# Traversal callback used by process_child_links.
#
# Arguments are ($he, $startflag, $depth) as supplied by the traversal.
# Nothing is done when $startflag is false.  For an <LI> element, the first
# child must be an <A>; its text becomes the entry title, and
# [title, depth, href] is appended to @current_contents_list.
# Returns 1 when $startflag is true and nothing when it is false.
# Dies if the first child of an <LI> is not an <A> element.
sub increment_current_contents_list ( $$$ )
{
    my ($he, $startflag, $depth) = check_args(3, @_);
    if (!$startflag)
      { return; }

    if ($he->tag eq "li")
      { # Check that the first child is an element before calling ->tag on
        # it, so an empty <LI> or one starting with text gets the intended
        # message rather than a method-call error.
        my $first = ($he->content || [])->[0];
        if ((!ref $first) || ($first->tag ne "a"))
          { die "first element of <LI> should be <A>"; }
        # anchor_info returns (name, href, content...); only href is needed.
        my $href = (anchor_info($first))[1];
        my $title = join("", collect_texts($first));
        $title = texi_remove_punctuation($title);
        # The problem with these is that they are formatted differently in
        # @menu and @node!
        $title =~ s/``/\"/g;
        $title =~ s/''/\"/g;
        $title =~ s/ -- / /g;
        push @current_contents_list, [ $title, $depth, $href ]; }
    return 1;
}
| 367 | |
# Simple HTML-to-Texinfo conversion, for section titles.
#
# A plain string is returned unchanged.  An element whose tag is a key of
# %inline_markup becomes "@command{...}", where the braces contain the
# recursively converted children.  Any other element is dumped and is fatal.
sub html_to_texi ( $ )
{
    my ($he) = check_args(1, @_);
    if (!ref $he)
      { return $he; }

    my $tag = $he->tag;
    if (!exists $inline_markup{$tag})
      { $he->dump();
        die "html_to_texi confused by <$tag>"; }

    my $result = "\@$inline_markup{$tag}\{";
    # content may be undef for an element with no children (e.g. <B></B>);
    # dereferencing undef would die under "use strict".
    for my $elt (@{ $he->content || [] })
      { $result .= html_to_texi($elt); }
    $result .= "\}";
    return $result;
}
| 385 | |
| 386 | |
| 387 | |
# Debugging aid: write @contents_list to STDERR, one "title depth file"
# line per entry, under a "Contents list:" heading.
sub print_contents_list ()
{
    check_args(0, @_);
    print STDERR "Contents list:\n";
    foreach my $entry (@contents_list) {
        # Take exactly the first three fields: title, depth, file.
        print STDERR join(" ", @{$entry}[0 .. 2]) . "\n";
    }
}
| 395 | |
| 396 | |
| 397 | |
| 398 | ########################################################################### |
| 399 | ### Index |
| 400 | ### |
| 401 | |
# Anchor name that appears in an index HREF in place of a real, numbered
# "l2h-NNN" anchor (a LaTeX2HTML bug); such links are tracked separately.
my $l2h_broken_link_name = 'l2h-';

# Map from file to (map from anchor name to (list of index texts)).
# (The list is needed when a single LaTeX command like \envvar
# expands to multiple \index commands.)
my %file_index_entries;
# The inner map for the file being processed:
# map from anchor name to (list of index texts).
my %this_index_entries;

# Same, for index links whose anchor name is $l2h_broken_link_name:
# map from file to (list of index texts), and the list for the current file.
my %file_index_entries_broken;
my @this_index_entries_broken;

# Prefix prepended to index texts inside nested <DL COMPACT> lists, and the
# stack of outer prefixes saved while a nested list is processed.
my $index_prefix = '';
my @index_prefixes;

# Texinfo indexing command for the index file being read; set and cleared
# by process_index_file.
my $this_indexing_command;
| 418 | |
# Debugging aid: dump %file_index_entries and %file_index_entries_broken to
# STDERR.  Files and anchor names are listed in sorted order; when an anchor
# has several index texts, they are printed one per line.
sub print_index_info ()
{
    check_args(0, @_);

    foreach my $file (sort keys %file_index_entries) {
        my $by_anchor = $file_index_entries{$file};
        print STDERR "file: $file\n";
        foreach my $aname (sort keys %{$by_anchor}) {
            my @entries = @{ $by_anchor->{$aname} };
            if (@entries == 1) {
                print STDERR " $aname : $entries[0]\n";
            }
            else {
                print STDERR " $aname : ", join("\n " . (" " x length($aname)), @entries), "\n";
            }
        }
    }

    foreach my $file (sort keys %file_index_entries_broken) {
        my @entries = @{ $file_index_entries_broken{$file} };
        print STDERR "file: $file\n";
        print STDERR " $_\n" foreach @entries;
    }
}
| 438 | |
| 439 | |
# Read one index page and record its entries.
#
# $file is taken relative to $html_directory.  $indexing_command is the
# Texinfo command to use for entries from this page; it is published in
# $this_indexing_command while the parsed tree is traversed with
# process_if_index_dl_compact, and cleared again afterwards.
sub process_index_file ( $$ )
{
    my ($file, $indexing_command) = check_args(2, @_);
    # print "process_index_file $file $indexing_command\n";

    my $tree = file_to_tree($html_directory . $file);
    # $tree->dump();

    $this_indexing_command = $indexing_command;
    $tree->traverse(\&process_if_index_dl_compact, 'ignore text');
    # print "process_index_file done\n";
    undef $this_indexing_command;
}
| 452 | |
| 453 | |
# Traversal callback used by process_index_file.
#
# Called with ($he, $startflag, $depth); the depth is ignored.  Nothing is
# done (and nothing is returned) when $startflag is false.  A <DL> element
# carrying a COMPACT attribute is handed to process_index_dl_compact and 0 is
# returned (presumably so the traversal does not descend into it -- see
# HTML::Element::traverse).  Every other element yields 1.
sub process_if_index_dl_compact ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1];
    return unless $startflag;

    return 1
        unless ($he->tag() eq "dl") && (defined $he->attr('compact'));

    process_index_dl_compact($he);
    return 0;
}
| 465 | |
| 466 | |
# Process a <DL COMPACT> list from a LaTeX2HTML index.
#
# The elements of such a list:
#  * a single space: text to be ignored
#  * <DT> elements with an optional <DD> element following each one
# Two types of <DT> elements:
#  * Followed by a <DD> element: the <DT> contains a single
#    string, and the <DD> contains a whitespace string to be ignored, a
#    <DL COMPACT> to be recursively processed (with the <DT> string as a
#    prefix), and a whitespace string to be ignored.
#  * Not followed by a <DD> element: contains a list of anchors
#    and texts (ignore the texts, which are only whitespace and commas).
#    Optionally contains a <DL COMPACT> to be recursively processed (with
#    the <DT> string as a prefix)
#
# A <DT>/<DD> pair goes to process_index_dt_and_dd; a <DT> on its own goes to
# process_index_lone_dt.  Dies on any child that is neither ignorable
# whitespace nor a <DT> (or the <DD> directly after a <DT>).
sub process_index_dl_compact ( $ )
{
    my ($h) = check_args(1, @_);
    # Discard whitespace-only text children, as described above; calling
    # ->tag on them would die.  Also tolerate a list with no content.
    my @content = grep { ref($_) || ($_ !~ /^\s*$/) } @{ $h->content() || [] };
    for (my $i = 0; $i < scalar(@content); $i++)
      { my $this_he = $content[$i];
        if (!ref $this_he)
          { die "Expected <DT> tag but saw text: $this_he"; }
        if ($this_he->tag ne "dt")
          { $this_he->dump();
            die "Expected <DT> tag: " . $this_he->tag; }
        my $next_he = ($i < scalar(@content) - 1) ? $content[$i+1] : undef;
        if (ref($next_he) && ($next_he->tag eq "dd"))
          { process_index_dt_and_dd($this_he, $next_he);
            # The <DD> has been consumed together with its <DT>.
            $i++; }
        else
          { process_index_lone_dt($this_he); } }
}
| 492 | |
| 493 | |
| 494 | |
# Record the index entries named by a <DT> that has no following <DD>.
#
# Argument is a <DT> element.  If it contains more than one anchor, then
# the texts of all subsequent ones are "[Link]".  Example:
#   <DT>
#     <A HREF="embedding.html#l2h-201">
#       "$PATH"
#     ", "
#     <A HREF="embedding.html#l2h-205">
#       "[Link]"
# Optionally contains a <DL COMPACT> as well.  Example:
#   <DT>
#     <A HREF="types.html#l2h-616">
#       "attribute"
#     <DL COMPACT>
#       <DT>
#         <A HREF="assignment.html#l2h-3074">
#           "assignment"
#         ", "
#         <A HREF="assignment.html#l2h-3099">
#           "[Link]"
#       <DT>
#         <A HREF="types.html#l2h-">
#           "assignment, class"
#
# For each anchor, "$this_indexing_command <prefix><text>" is appended to
# %file_index_entries{file}{anchor name}, or to
# %file_index_entries_broken{file} when the anchor name is the broken
# LaTeX2HTML name $l2h_broken_link_name.  A nested <DL COMPACT> is processed
# recursively with the first anchor's text added to $index_prefix, after
# which the function returns.  Dies on any unexpected structure.
sub process_index_lone_dt ( $ )
{
    my ($dt) = check_args(1, @_);
    # An empty <DT> has undef content; treat it as having no children.
    my @dtcontent = @{ $dt->content() || [] };
    my $acontent;           # full (prefixed) index text of the first anchor
    my $acontent_suffix;    # the same text without $index_prefix
    # (The loop variable is deliberately not called $a, which would mask
    # the package variable used by sort.)
    for my $child (@dtcontent)
      { if ($child eq ", ")
          { next; }
        if (!ref $child)
          { $dt->dump;
            die "Unexpected <DT> string element: $child"; }

        if ($child->tag eq "dl")
          { push @index_prefixes, $index_prefix;
            if (!defined $acontent_suffix)
              { die "acontent_suffix not yet defined"; }
            $index_prefix .= $acontent_suffix . ", ";
            process_index_dl_compact($child);
            $index_prefix = pop(@index_prefixes);
            return; }

        if ($child->tag ne "a")
          { $dt->dump;
            $child->dump;
            die "Expected anchor in lone <DT>"; }

        # anchor_info returns (name, href, content...); the name is unused.
        my ($aname, $ahref, @acontent) = anchor_info($child);
        if (scalar(@acontent) != 1)
          { die "Expected just one content of <A> in <DT>: @acontent"; }
        if (ref $acontent[0])
          { $acontent[0]->dump;
            die "Expected string content of <A> in <DT>: $acontent[0]"; }
        if (!defined($acontent))
          { $acontent = $index_prefix . $acontent[0];
            $acontent_suffix = $acontent[0]; }
        # Later anchors must either say "[Link]" or repeat the first text.
        elsif (($acontent[0] ne "[Link]") && ($acontent ne ($index_prefix . $acontent[0])))
          { die "Differing content: <<<$acontent>>>, <<<$acontent[0]>>>"; }

        if (!defined $ahref)
          { $dt->dump;
            die "no HREF in anchor in <DT>"; }
        my ($ahref_file, $ahref_name) = split(/\#/, $ahref);
        if (!defined $ahref_name)
          { # Reference to entire file
            $ahref_name = ""; }

        if ($ahref_name eq $l2h_broken_link_name)
          { if (!exists $file_index_entries_broken{$ahref_file})
              { $file_index_entries_broken{$ahref_file} = []; }
            push @{$file_index_entries_broken{$ahref_file}}, "$this_indexing_command $acontent";
            next; }

        if (!exists $file_index_entries{$ahref_file})
          { $file_index_entries{$ahref_file} = {}; }
        # Don't do this!  It appears to make a copy, which is not desired.
        #  my %index_entries = %{$file_index_entries{$ahref_file}};
        if (!exists $file_index_entries{$ahref_file}{$ahref_name})
          { $file_index_entries{$ahref_file}{$ahref_name} = []; }

        push @{ $file_index_entries{$ahref_file}{$ahref_name} }, "$this_indexing_command $acontent";
        # print STDERR "keys: ", keys %{$file_index_entries{$ahref_file}}, "\n";
      }
}
| 586 | |
# Process a <DT> that is followed by a <DD>.
#
# The <DT> must contain exactly one string, and the <DD> exactly one child,
# which must be a <DL>.  That list is processed recursively by
# process_index_dl_compact with the <DT> string (trailing spaces removed)
# plus ", " appended to $index_prefix; the previous prefix is restored
# afterwards.  Dies if either element has a different shape.
sub process_index_dt_and_dd ( $$ )
{
    my ($dt, $dd) = check_args(2, @_);

    # An element without children has undef content; treat that as an empty
    # list so that the shape checks below report it.
    my @dtcontent = @{ $dt->content() || [] };
    if ((scalar(@dtcontent) != 1) || (ref $dtcontent[0]))
      { $dd->dump;
        $dt->dump;
        die "Expected single string (actual size = " . scalar(@dtcontent) . ") in content of <DT>: @dtcontent"; }
    my $dtcontent = $dtcontent[0];
    $dtcontent =~ s/ +$//;

    my @ddcontent = @{ $dd->content() || [] };
    if (scalar(@ddcontent) != 1)
      { die "Expected single <DD> content, got ", scalar(@ddcontent), " elements:\n", join("\n", @ddcontent), "\n "; }
    my $ddcontent = $ddcontent[0];
    # Check for an element first: a plain string has no ->tag method.
    if ((!ref $ddcontent) || ($ddcontent->tag ne "dl"))
      { die "Expected <DL> as content of <DD>, but saw: $ddcontent"; }

    push @index_prefixes, $index_prefix;
    $index_prefix .= $dtcontent . ", ";
    process_index_dl_compact($ddcontent);
    $index_prefix = pop(@index_prefixes);
}
| 610 | |
| 611 | |
| 612 | ########################################################################### |
| 613 | ### Ordinary sections |
| 614 | ### |
| 615 | |
# Convert one HTML section file.
#
# Arguments:
#   $file      - HTML file name; taken relative to $html_directory unless it
#                starts with "/".
#   $depth     - nesting depth of the section (0 for the top level).
#   $nodetitle - node title for the section.
#
# The file is parsed; @section_stack is cut back to $depth entries and the
# node title pushed (prefixed with the first word of the parent's title when
# the title is listed in %contents_fixups); child links and footnotes are
# collected; the index entries recorded for $file are loaded into
# %this_index_entries / @this_index_entries_broken; and finally the <BODY>
# is traversed with output_body.
# Dies unless the tree is <HTML> with <HEAD> and <BODY> as its first two
# children.
sub process_section_file ( $$$ )
{
    my ($file, $depth, $nodetitle) = check_args(3, @_);
    my $he = file_to_tree(($file =~ /^\//) ? $file : $html_directory . $file);

    # print STDERR "process_section_file: $file $depth $nodetitle\n";

    # Keep only the $depth outermost entries.  Equivalently:
    #   while (scalar(@section_stack) > $depth) { pop(@section_stack); }
    # (A slice 0..$depth-1 would pad the stack with undef entries whenever
    # it is shorter than $depth.)
    my $keep = ($depth > 0) ? $depth : 0;
    if (scalar(@section_stack) > $keep)
      { splice(@section_stack, $keep); }

    # Not a great nodename fixup scheme; need a more global view
    if ((defined $contents_fixups{$nodetitle})
        && (scalar(@section_stack) > 0))
      { my $up_title = $section_stack[$#section_stack];
        # hack for Python Standard Library
        $up_title =~ s/^(Built-in|Standard) Module //g;
        my ($up_first_word) = split(/ /, $up_title);
        $nodetitle = "$up_first_word $nodetitle";
      }

    push @section_stack, $nodetitle;
    # print STDERR "new section_stack: ", join(", ", @section_stack), "\n";

    $he->traverse(\&process_if_child_links, 'ignore text');
    %footnotes = ();
    # $he->dump;
    $he->traverse(\&process_if_footnotes, 'ignore text');

    # $he->dump;

    # Load the index entries that the index pages recorded for this file.
    if (exists $file_index_entries{$file})
      { %this_index_entries = %{$file_index_entries{$file}};
        # print STDERR "this_index_entries:\n ", join("\n ", keys %this_index_entries), "\n";
      }
    else
      { # print STDERR "Warning: no index entries for file $file\n";
        %this_index_entries = (); }

    if (exists $file_index_entries_broken{$file})
      { @this_index_entries_broken = @{$file_index_entries_broken{$file}}; }
    else
      { # print STDERR "Warning: no index entries for file $file\n";
        @this_index_entries_broken = (); }


    if ($he->tag() ne "html")
      { die "Expected <HTML> at top level"; }
    my @content = @{$he->content()};
    if ((!ref $content[0]) or ($content[0]->tag ne "head"))
      { $he->dump;
        die "<HEAD> not first element of <HTML>"; }
    if ((!ref $content[1]) or ($content[1]->tag ne "body"))
      { $he->dump;
        die "<BODY> not second element of <HTML>"; }

    $content[1]->traverse(\&output_body);
}
| 673 | |
# Stack of the tags we are currently inside that prevent index entries from
# being written out right now.
# These are "h1", "h2", "h3", "h4", "h5", "h6", "dt" (and possibly others?)
my @index_deferrers;
| 677 | |
# Track entry to and exit from an element that defers index output.
#
# With a true $startflag, $tag is pushed onto @index_deferrers.  With a false
# $startflag, the top of the stack is popped and must equal $tag (otherwise
# this dies); any pending index entries are then offered to
# do_deferred_index_entries.
sub push_or_pop_index_deferrers ( $$ )
{
    my ($tag, $startflag) = check_args(2, @_);

    unless ($startflag) {
        my $old_deferrer = pop @index_deferrers;
        die "Expected $tag at top of index_deferrers but saw $old_deferrer; remainder = ", join(" ", @index_deferrers)
            if $tag ne $old_deferrer;
        return do_deferred_index_entries();
    }

    push @index_deferrers, $tag;
}
| 688 | |
| 689 | |
# Queue the index entries that belong to the anchor named $label.
#
# $he (optional) is the anchor element.  If %this_index_entries has entries
# for $label, they are appended to @deferred_index_entries.  Otherwise, if
# $label is the broken LaTeX2HTML name $l2h_broken_link_name, the entries are
# guessed: every element of @this_index_entries_broken that contains one of
# the nearby texts (the anchor's own texts, then up to four of the texts
# preceding it in its parent, nearest first) is appended.
sub label_add_index_entries ( $;$ )
{
    my ($label, $he) = check_args_range(1, 2, @_);
    # print ((exists $this_index_entries{$label}) ? "*" : " "), " label_add_index_entries $label\n";

    if (exists $this_index_entries{$label}) {
        push @deferred_index_entries, @{$this_index_entries{$label}};
        return;
    }

    if ($label eq $l2h_broken_link_name) {
        # Try to find some text to use in guessing which links should point here
        # I should probably only look at the previous element, or if that is
        # all punctuation, the one before it; collecting all the previous texts
        # is a bit of overkill.
        my @anchor_texts = collect_texts($he);
        my @previous_texts = collect_texts($he->parent, $he);
        # 4 elements is arbitrary; ought to filter out punctuation and small words
        # first, then perhaps keep fewer.  Perhaps also filter out formatting so
        # that we can see a larger chunk of text?  (Probably not.)
        # Also perhaps should do further chunking into words, in case the
        # index term isn't a chunk of its own (eg, was in <tt>...</tt>.
        my $last_wanted = min(3,$#previous_texts);
        my @nearest_texts = (reverse(@previous_texts))[0..$last_wanted];
        my @candidate_texts = (@anchor_texts, @nearest_texts);

        my $guessed = 0;
        for my $text (@candidate_texts) {
            # Skip texts that are only punctuation, or too short to be useful.
            next if $text =~ /^[\"\`\'().?! ]*$/;
            next if length($text) <= 2;
            # hack for Python manual; maybe defer until failure first time around?
            $text =~ s/^sys\.//g;
            # I could test for zero: LaTeX2HTML's failures in the Python
            # documentation are only for items of the form "... (built-in...)"
            my @matching_terms = grep { index($_, $text) != -1 } @this_index_entries_broken;
            if (@matching_terms) {
                push @deferred_index_entries, @matching_terms;
                # print STDERR "Guessing index terms `@matching_terms' for text `$text'\n";
                $guessed = 1;
            }
        }
        if (!$guessed) {
            # print STDERR "No guess in `", join("'; `", @this_index_entries_broken), "' for texts:\n `", join("'\n `", @candidate_texts), "'\n";
        }
    }
}
| 734 | |
| 735 | |
| 736 | # Need to add calls to this at various places. |
| 737 | # Perhaps add HTML::Element argument and do the check for appropriateness |
| 738 | # here (ie, no action if inside <H1>, etc.). |
# Write the queued index entries to TEXI, one per line, and empty the queue.
# Does nothing while the queue is empty or while any element on
# @index_deferrers is still open.
sub do_deferred_index_entries ()
{
    check_args(0, @_);
    return if scalar(@deferred_index_entries) == 0;
    return if scalar(@index_deferrers) != 0;
    print TEXI "\n", join("\n", @deferred_index_entries), "\n";
    @deferred_index_entries = ();
}
| 746 | |
my $table_columns;              # undefined if not in a table
my $table_first_column;         # boolean

# Traversal callback that writes the Texinfo equivalent of one node to TEXI.
#
# Arguments (as supplied by HTML::Element::traverse):
#   $he        -- an element, or a plain string for a text node.
#   $startflag -- true on entering an element, false on leaving it.
#   depth      -- ignored.
#
# Returns 1 to let the traversal descend into the element's children, and
# 0 when the element has been handled completely here (or is being skipped)
# and its children must not be visited.  For text nodes the return value is
# empty.
sub output_body ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument

    if (!ref $he) {
        # Text node.  Pending index entries are emitted after the first
        # space, if there is one.
        my $space_index = index($he, " ");
        if ($space_index != -1) {
            # Why does
            #   print TEXI texi_quote(substr($he, 0, $space_index+1));
            # give:  Can't locate object method "TEXI" via package "texi_quote"
            # (Because the definition texi_quote hasn't been seen yet.)
            print TEXI &texi_quote(substr($he, 0, $space_index+1));
            do_deferred_index_entries();
            print TEXI &texi_quote(substr($he, $space_index+1));
        }
        else {
            print TEXI &texi_quote($he);
        }
        return;
    }

    my $tag = $he->tag();

    # Ordinary text markup first
    if (exists $inline_markup{$tag}) {
        if ($startflag) {
            print TEXI "\@$inline_markup{$tag}\{";
        }
        else {
            print TEXI "\}";
        }
    }
    elsif ($tag eq "a") {
        my ($name, $href, @content) = anchor_info($he);
        if (!$href) {
            # This anchor is only here for indexing/cross referencing purposes.
            if ($startflag) {
                label_add_index_entries($name, $he);
            }
        }
        elsif ($href =~ /^(ftp|https?|news):/) {
            # External link; "https" is accepted alongside "http".
            if ($startflag) {
                # Should avoid second argument if it's identical to the URL.
                print TEXI "\@uref\{$href, ";
            }
            else {
                print TEXI "\}";
            }
        }
        elsif ($href =~ /^\#(foot[0-9]+)$/) {
            # Footnote
            if ($startflag) {
                # Could double-check name and content, but I'm not
                # currently storing that information.
                my $footnote = $footnotes{$1};
                if (!defined $footnote) {
                    $he->dump;
                    die "No footnote recorded for anchor $href";
                }
                print TEXI "\@footnote\{";
                $footnote->traverse(\&output_body);
                print TEXI "\}";
                return 0;
            }
        }
        else {
            if ($startflag) {
                # cross-references are not active Info links, but no text is lost
                print STDERR "Can't deal with internal HREF anchors yet:\n";
                $he->dump;
            }
        }
    }
    elsif ($tag eq "br") {
        print TEXI "\@\n";
    }
    elsif ($tag eq "body") {
    }
    elsif ($tag eq "center") {
        # A <center> holding only spaces produces no output at all.
        if (has_single_content_string($he)
            && ($ {$he->content}[0] =~ /^ *$/)) {
            return 0;
        }
        if ($startflag) {
            print TEXI "\n\@center\n";
        }
        else {
            print TEXI "\n\@end center\n";
        }
    }
    elsif ($tag eq "div") {
        # Only <div align="center"> produces output; it is treated like <center>.
        my $align = $he->attr('align');
        if (defined($align) && ($align eq "center")) {
            if (has_single_content_string($he)
                && ($ {$he->content}[0] =~ /^ *$/)) {
                return 0;
            }
            if ($startflag) {
                print TEXI "\n\@center\n";
            }
            else {
                print TEXI "\n\@end center\n";
            }
        }
    }
    elsif ($tag eq "dl") {
        # Recognize "<dl><dd><pre> ... </pre></dl>" paradigm for "@example"
        if (has_single_content_with_tag($he, "dd")) {
            my $he_dd = $ {$he->content}[0];
            if (has_single_content_with_tag($he_dd, "pre")) {
                my $he_pre = $ {$he_dd->content}[0];
                print_pre($he_pre);
                return 0;
            }
        }
        if ($startflag) {
            # Could examine the elements, to be cleverer about formatting.
            # (Also to use ftable, vtable...)
            print TEXI "\n\@table \@asis\n";
        }
        else {
            print TEXI "\n\@end table\n";
        }
    }
    elsif ($tag eq "dt") {
        # Index entries are held back while inside the <dt>.
        push_or_pop_index_deferrers($tag, $startflag);
        if ($startflag) {
            print TEXI "\n\@item ";
        }
    }
    elsif ($tag eq "dd") {
        if ($startflag) {
            print TEXI "\n";
        }
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
    }
    elsif ($tag =~ /^(font|big|small)$/) {
        # Do nothing for now.
    }
    elsif ($tag =~ /^h[1-6]$/) {
        # We don't need this because we never recursively enter the heading content.
        # push_or_pop_index_deferrers($tag, $startflag);
        my $secname = "";
        my @seclabels = ();
        for my $elt (@{$he->content}) {
            if (!ref $elt) {
                $secname .= $elt;
            }
            elsif ($elt->tag eq "br") {
            }
            elsif ($elt->tag eq "a") {
                my ($name, $href, @acontent) = anchor_info($elt);
                if ($href) {
                    $he->dump;
                    $elt->dump;
                    die "Nonsimple anchor in <$tag>";
                }
                if (!defined $name) {
                    die "No NAME for anchor in $tag";
                }
                push @seclabels, $name;
                for my $subelt (@acontent) {
                    $secname .= html_to_texi($subelt);
                }
            }
            else {
                $secname .= html_to_texi($elt);
            }
        }
        if ($secname eq "") {
            die "No section name in <$tag>";
        }
        if (scalar(@section_stack) == 1) {
            if ($section_stack[-1] ne "Top") {
                die "Not top? $section_stack[-1]";
            }
            print TEXI "\@settitle $secname\n";
            print TEXI "\@c %**end of header\n";
            print TEXI "\n";
            print TEXI "\@node Top\n";
            print TEXI "\n";
        }
        else {
            print TEXI "\n\@node $section_stack[-1]\n";
            print TEXI "\@$sectionmarker[scalar(@section_stack)-1] ", texi_remove_punctuation($secname), "\n";
        }
        for my $seclabel (@seclabels) {
            label_add_index_entries($seclabel);
        }
        # This should only happen once per file.
        label_add_index_entries("");
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
        return 0;
    }
    elsif ($tag eq "hr") {
    }
    elsif ($tag eq "ignore") {
        # Hack for ignored elements
        return 0;
    }
    elsif ($tag eq "li") {
        if ($startflag) {
            print TEXI "\n\n\@item\n";
            do_deferred_index_entries();
        }
    }
    elsif ($tag eq "ol") {
        if ($startflag) {
            # Plain @enumerate: its optional argument is a starting number
            # or letter, so "@bullet" is not acceptable there.
            print TEXI "\n\@enumerate\n";
        }
        else {
            print TEXI "\n\@end enumerate\n";
        }
    }
    elsif ($tag eq "p") {
        if ($startflag) {
            print TEXI "\n\n";
        }
        if (scalar(@index_deferrers) != 0) {
            $he->dump;
            die "Unexpected <$tag> while inside: (" . join(" ", @index_deferrers) . "); bad HTML?";
        }
        do_deferred_index_entries();
    }
    elsif ($tag eq "pre") {
        print_pre($he);
        return 0;
    }
    elsif ($tag eq "table") {
        # Could also indicate common formatting for first column, or
        # determine relative widths for columns (or determine a prototype row)
        if ($startflag) {
            if (defined $table_columns) {
                $he->dump;
                die "Can't deal with table nested inside $table_columns-column table";
            }
            $table_columns = table_columns($he);
            if ($table_columns < 2) {
                $he->dump;
                die "Column with $table_columns columns?";
            }
            elsif ($table_columns == 2) {
                # Two columns map onto an @table: item plus description.
                print TEXI "\n\@table \@asis\n";
            }
            else {
                # Wider tables become an @multitable with equal-width columns.
                print TEXI "\n\@multitable \@columnfractions";
                for (my $i=0; $i<$table_columns; $i++) {
                    print TEXI " ", 1.0/$table_columns;
                }
                print TEXI "\n";
            }
        }
        else {
            if ($table_columns == 2) {
                print TEXI "\n\@end table\n";
            }
            else {
                print TEXI "\n\@end multitable\n";
            }
            undef $table_columns;
        }
    }
    elsif (($tag eq "td") || ($tag eq "th")) {
        if ($startflag) {
            if ($table_first_column) {
                print TEXI "\n\@item ";
                $table_first_column = 0;
            }
            elsif ($table_columns > 2) {
                print TEXI "\n\@tab ";
            }
        }
        else {
            print TEXI "\n";
        }
    }
    elsif ($tag eq "tr") {
        if ($startflag) {
            $table_first_column = 1;
        }
    }
    elsif ($tag eq "ul") {
        if ($startflag) {
            print TEXI "\n\@itemize \@bullet\n";
        }
        else {
            print TEXI "\n\@end itemize\n";
        }
    }
    else {
        # I used to have a newline before "output_body" here.
        print STDERR "output_body: ignoring <$tag> tag\n";
        $he->dump;
        return 0;
    }

    return 1;
}
| 983 | |
# Write a <PRE> element to TEXI as an @example block.
# The element must contain exactly one text string; anything else is fatal.
sub print_pre ( $ )
{
    my ($he_pre) = check_args(1, @_);
    has_single_content_string($he_pre)
        or die "Multiple or non-string content for <PRE>: ", @{$he_pre->content};
    my ($pre_text) = @{$he_pre->content};
    print TEXI "\n\@example";
    print TEXI &texi_quote($pre_text);
    print TEXI "\@end example\n";
}
| 993 | |
# Return the number of children of the widest row of $table.
# Every child of $table must be a <TR> element; otherwise the table and the
# offending child are dumped and the program dies.
sub table_columns ( $ )
{
    my ($table) = check_args(1, @_);
    my $widest = 0;
    foreach my $row (@{$table->content}) {
        if ($row->tag ne "tr") {
            $table->dump;
            $row->dump;
            die "Expected <TR> as table row.";
        }
        $widest = max($widest, scalar(@{$row->content}));
    }
    return $widest;
}
| 1005 | |
| 1006 | |
| 1007 | ########################################################################### |
| 1008 | ### Utilities |
| 1009 | ### |
| 1010 | |
# Return the numerically smaller of the two arguments.
sub min ( $$ )
{
    my ($x, $y) = check_args(2, @_);
    if ($x < $y) {
        return $x;
    }
    return $y;
}
| 1015 | |
# Return the numerically larger of the two arguments.
sub max ( $$ )
{
    my ($x, $y) = check_args(2, @_);
    if ($x > $y) {
        return $x;
    }
    return $y;
}
| 1020 | |
# Parse the HTML file $file and return its parse tree after it has been
# tidied by cleanup_parse_tree.  Unknown tags are ignored by the parser.
# Dies if the file cannot be parsed (for example, if it cannot be opened),
# rather than silently continuing with an empty tree.
sub file_to_tree ( $ )
{
    my ($file) = check_args(1, @_);

    my $tree = HTML::TreeBuilder->new;
    $tree->ignore_unknown(1);
    # $tree->warn(1);
    # parse_file returns undef when the file can't be opened; $! says why.
    defined $tree->parse_file($file)
        or die "Can't parse HTML file `$file': $!";
    cleanup_parse_tree($tree);
    return $tree;
}
| 1031 | |
| 1032 | |
# Return 1 if element $he has exactly one child (of any kind), else 0.
# An element with no content at all yields 0.  Dies if $he is not a reference.
sub has_single_content ( $ )
{
    my ($he) = check_args(1, @_);
    # return 0;
    ref $he
        or die "Non-reference argument: $he";
    my $ref_content = $he->content;
    return 0 if !defined $ref_content;
    return (scalar(@{$ref_content}) == 1) ? 1 : 0;
}
| 1046 | |
| 1047 | |
| 1048 | # Return true if the content of the element contains only one element itself, |
| 1049 | # and that inner element has the specified tag. |
# True if $he has exactly one child, that child is an element (not text),
# and the child's tag is $tag.
sub has_single_content_with_tag ( $$ )
{
    my ($he, $tag) = check_args(2, @_);
    return 0 if !has_single_content($he);
    my ($only_child) = @{$he->content};
    return 0 if !ref $only_child;
    my $child_tag = $only_child->tag;
    return 0 if !defined $child_tag;
    return $child_tag eq $tag;
}
| 1062 | |
# Return 1 if $he has exactly one child and that child is a text string
# (not an element); otherwise return 0.
sub has_single_content_string ( $ )
{
    my ($he) = check_args(1, @_);
    return 0 if !has_single_content($he);
    my ($only_child) = @{$he->content};
    return (ref $only_child) ? 0 : 1;
}
| 1072 | |
| 1073 | |
| 1074 | # Return name, href, content. First two may be undefined; third is an array. |
| 1075 | # I don't see how to determine if there are more attributes. |
# Return the list (NAME attribute, HREF attribute, children...) for the
# anchor element $he.  Either attribute may be undefined, and there may be
# no children.  Dies (after dumping $he) if $he is not an <A> element.
sub anchor_info ( $ )
{
    my ($he) = check_args(1, @_);
    if ($he->tag ne "a") {
        $he->dump;
        die "passed non-anchor to anchor_info";
    }
    my $name = $he->attr('name');
    my $href = $he->attr('href');
    my $ref_content = $he->content;
    my @children = (defined $ref_content) ? @{$ref_content} : ();
    return ($name, $href, @children);
}
| 1089 | |
| 1090 | |
# Return a copy of $text made safe for Texinfo: "@", "{" and "}" are each
# prefixed with "@", and " -- " is turned into " --- ".
sub texi_quote ( $ )
{
    my ($text) = check_args(1, @_);
    for ($text) {
        s/([\@\{\}])/\@$1/g;
        s/ -- / --- /g;
    }
    return $text;
}
| 1097 | |
| 1098 | # Eliminate bad punctuation (that confuses Makeinfo or Info) for section titles. |
# Return a copy of a section title with the punctuation that confuses
# Makeinfo or Info taken out.
sub texi_remove_punctuation ( $ )
{
    my ($text) = check_args(1, @_);

    for ($text) {
        s/^ +//g;                  # leading spaces
        s/[ :]+$//g;               # trailing spaces and colons
        s/^[1-9][0-9.]* +//g;      # leading section number
        s/,//g;                    # commas
        # Both embedded colons and " -- " confuse makeinfo.  (Perhaps " -- "
        # gets converted into " - ", just as "---" would be converted into
        # " -- ", so the names end up differing.)
        # s/:/ -- /g;
        s/://g;
    }
    return $text;
}
| 1113 | |
| 1114 | |
| 1115 | ## Do not use this inside `traverse': it throws off the traversal. Use |
| 1116 | ## html_replace_by_ignore or html_replace_by_meta instead. |
| 1117 | # Returns 1 if success, 0 if failure. |
# Remove element $he from the content of $parent (default: $he's own parent)
# and clear $he's parent link.  Returns 1 on success; dies if $he is not
# among the parent's children.
sub html_remove ( $;$ )
{
    my ($he, $parent) = check_args_range(1, 2, @_);
    $parent = $he->parent if !defined $parent;
    my $siblings = $parent->content;
    my $last_pos = scalar(@{$siblings}) - 1;
    foreach my $pos (0 .. $last_pos) {
        next if $siblings->[$pos] ne $he;
        splice @{$siblings}, $pos, 1;
        $he->parent(undef);
        return 1;
    }
    die "Didn't find $he in $parent";
}
| 1131 | |
| 1132 | |
# Put $new in the place of $orig within the content of $parent (default:
# $orig's own parent).  $new's parent link is set to $parent and $orig's is
# cleared.  Returns 1 on success; dies if $orig is not among the children.
sub html_replace ( $$;$ )
{
    my ($orig, $new, $parent) = check_args_range(2, 3, @_);
    $parent = $orig->parent if !defined $parent;
    my $siblings = $parent->content;
    my $last_pos = scalar(@{$siblings}) - 1;
    foreach my $pos (0 .. $last_pos) {
        next if $siblings->[$pos] ne $orig;
        $siblings->[$pos] = $new;
        $new->parent($parent);
        $orig->parent(undef);
        return 1;
    }
    die "Didn't find $orig in $parent";
}
| 1147 | |
# Replace $orig, within $parent (default: its own parent), by a freshly
# created <meta> element.  Returns the result of html_replace.
sub html_replace_by_meta ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("meta");
    $parent = $orig->parent if !defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
| 1155 | |
# Replace $orig, within $parent (default: its own parent), by a freshly
# created <ignore> element.  Returns the result of html_replace.
sub html_replace_by_ignore ( $;$ )
{
    my ($orig, $parent) = check_args_range(1, 2, @_);
    my $placeholder = HTML::Element->new("ignore");
    $parent = $orig->parent if !defined $parent;
    return html_replace($orig, $placeholder, $parent);
}
| 1163 | |
| 1164 | |
| 1165 | |
| 1166 | ### |
| 1167 | ### Collect text elements |
| 1168 | ### |
| 1169 | |
# State shared between collect_texts and its traversal callback.
my @collected_texts;
my $collect_texts_stoppoint;
my $done_collecting;

# Return the text strings found by traversing the tree under $root.
# If $stop is given, collecting ends when that element is reached.
sub collect_texts ( $;$ )
{
    my ($root, $stop) = check_args_range(1, 2, @_);
    # print STDERR "collect_texts: $root $stop\n";
    ($collect_texts_stoppoint, $done_collecting) = ($stop, 0);
    @collected_texts = ();
    $root->traverse(\&collect_if_text); # process texts
    # print STDERR "collect_texts => ", join(";;;", @collected_texts), "\n";
    return @collected_texts;
}
| 1184 | |
# Traversal callback for collect_texts.  Text strings are appended to
# @collected_texts.  Reaching the stop element sets $done_collecting, after
# which every call returns 0.  Returns 1 only for elements whose children
# should still be visited.
sub collect_if_text ( $$$ )
{
    my ($he) = (check_args(3, @_))[0]; # ignore depth and startflag arguments
    return 0 if $done_collecting || !defined $he;

    if (ref $he) {
        my $at_stop = (defined $collect_texts_stoppoint)
            && ($he eq $collect_texts_stoppoint);
        return 1 if !$at_stop;
        $done_collecting = 1;
        return 0;
    }

    push @collected_texts, $he;
    return 0;
}
| 1199 | |
| 1200 | |
| 1201 | ########################################################################### |
| 1202 | ### Clean up parse tree |
| 1203 | ### |
| 1204 | |
# Run the clean-up passes over the tree rooted at $he, in order, each as a
# separate traversal that skips text nodes.  Returns $he.
sub cleanup_parse_tree ( $ )
{
    my ($he) = check_args(1, @_);
    my @passes = (\&delete_if_navigation,
                  \&delete_extra_spaces,
                  \&merge_dl,
                  \&reorder_dt_and_dl);
    foreach my $pass (@passes) {
        $he->traverse($pass, 'ignore text');
    }
    return $he;
}
| 1213 | |
| 1214 | |
| 1215 | ## Simpler version that deletes contents but not the element itself. |
| 1216 | # sub delete_if_navigation ( $$$ ) |
| 1217 | # { my $he = (check_args(3, @_))[0]; # ignore startflag and depth |
| 1218 | # if (($he->tag() eq "div") && ($he->attr('class') eq 'navigation')) |
| 1219 | # { $he->delete(); |
| 1220 | # return 0; } |
| 1221 | # else |
| 1222 | # { return 1; } |
| 1223 | # } |
| 1224 | |
# Traversal callback: remove <div class="navigation"> elements.
# Such an element is spliced out of its parent's content and deleted, and 0
# is returned so its children are not visited.  Any other element yields 1.
# End-tag calls return nothing.
sub delete_if_navigation ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    return if !$startflag;

    my $is_navigation = ($he->tag() eq "div")
        && (defined $he->attr('class'))
        && ($he->attr('class') eq 'navigation');
    return 1 if !$is_navigation;

    # Work on the parent's own content array, not on a copy of it.
    my $siblings = $he->parent()->content();
    foreach my $pos (0 .. scalar(@{$siblings}) - 1) {
        if ($siblings->[$pos] eq $he) {
            splice(@{$siblings}, $pos, 1);
            last;
        }
    }
    $he->delete();
    return 0;
}
| 1243 | |
# Traversal callback: strip superfluous all-space text from an element.
# For <head>, <html>, <table>, <tr> and <ul>, every all-space child is
# removed; for every element, delete_trailing_spaces is applied as well.
# Returns 1 on start-tag calls and nothing on end-tag calls.
sub delete_extra_spaces ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    return if !$startflag;

    delete_child_spaces($he)
        if $he->tag =~ /^(head|html|table|tr|ul)$/;
    delete_trailing_spaces($he);
    return 1;
}
| 1255 | |
| 1256 | |
# Remove, in place, every child of $he that is empty or consists only of
# spaces.  Element children are kept: a stringified reference never matches
# the pattern.  An element without any content is left alone, matching the
# guard in delete_trailing_spaces.
sub delete_child_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    return if !defined $ref_content;
    # Assign back into the same array so the element sees the change.
    @{$ref_content} = grep { $_ !~ /^ *$/ } @{$ref_content};
    return;
}
| 1265 | |
# Remove all-space text children of $he that serve no purpose:
#  * one that comes directly before a <br>, <dd>, <dl>, <dt>, <hr>, <p> or
#    <ul> child;
#  * the final child, when $he itself is a <dd>, <dt>, heading, <li> or <p>.
# Does nothing when $he has no content.
sub delete_trailing_spaces ( $ )
{
    my ($he) = check_args(1, @_);
    my $ref_content = $he->content();
    return if !defined $ref_content;

    # Could also check for previous element = /^h[1-6]$/.
    my $pos = 0;
    while ($pos < scalar(@{$ref_content}) - 1) {
        my ($this_elt, $next_elt) = @{$ref_content}[$pos, $pos+1];
        if (($this_elt =~ /^ *$/)
            && (ref $next_elt)
            && ($next_elt->tag =~ /^(br|dd|dl|dt|hr|p|ul)$/)) {
            # Stay at the same position: a new child has moved into it.
            splice(@{$ref_content}, $pos, 1);
        }
        else {
            $pos++;
        }
    }

    if ($he->tag =~ /^(dd|dt|^h[1-6]|li|p)$/) {
        my $last_elt = $ref_content->[-1];
        pop @{$ref_content}
            if (defined $last_elt) && ($last_elt =~ /^ *$/);
    }
}
| 1283 | |
| 1284 | |
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 1285 | # LaTeX2HTML sometimes creates |
| 1286 | # <DT>text |
| 1287 | # <DL COMPACT><DD>text |
| 1288 | # which should actually be: |
| 1289 | # <DL COMPACT> |
| 1290 | # <DT>text |
| 1291 | # <DD>text |
| 1292 | # Since a <DL> gets added, this ends up looking like |
| 1293 | # <P> |
| 1294 | # <DL> |
| 1295 | # <DT> |
| 1296 | # text1... |
| 1297 | # <DL COMPACT> |
| 1298 | # <DD> |
| 1299 | # text2... |
| 1300 | # dt_or_dd1... |
| 1301 | # dt_or_dd2... |
| 1302 | # which should become |
| 1303 | # <P> |
| 1304 | # <DL COMPACT> |
| 1305 | # <DT> |
| 1306 | # text1... |
| 1307 | # <DD> |
| 1308 | # text2... |
| 1309 | # dt_or_dd1... |
| 1310 | # dt_or_dd2... |
| 1311 | |
# Traversal callback that repairs the LaTeX2HTML construct
#   <P> <DL(implicit)> <DT> text1... <DL> <DD> text2... </DL> </DT> ...
# by replacing the inner <DL> with an <ignore> element and moving its
# children into the outer <DL>, directly after the <DT>.
#
# Returns 0 (don't traverse children) when the tree was rewritten, 1
# otherwise; end-tag calls return nothing.
sub reorder_dt_and_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    return if !$startflag;
    return 1 if $he->tag() ne "p";

    # The <p> must start with an implicit <dl> ...
    my $ref_pcontent = $he->content();
    return 1 if !defined($ref_pcontent) || (scalar(@{$ref_pcontent}) < 1);
    my $outer_dl = $ref_pcontent->[0];
    return 1 if !(ref $outer_dl);
    return 1 if ($outer_dl->tag() ne "dl") || !$outer_dl->implicit();

    # ... whose first child is a <dt> ...
    my $ref_dlcontent = $outer_dl->content();
    return 1 if !defined($ref_dlcontent) || (scalar(@{$ref_dlcontent}) < 1);
    my $dt = $ref_dlcontent->[0];
    return 1 if !(ref $dt) || ($dt->tag() ne "dt");

    # ... whose last child is another <dl> ...
    my $ref_dtcontent = $dt->content();
    return 1 if !defined($ref_dtcontent) || (scalar(@{$ref_dtcontent}) == 0);
    my $inner_dl = $ref_dtcontent->[-1];
    return 1 if !(ref $inner_dl) || ($inner_dl->tag() ne "dl");

    # ... which in turn starts with a <dd>.
    my $ref_dl2content = $inner_dl->content();
    return 1 if !defined($ref_dl2content) || (scalar(@{$ref_dl2content}) == 0);
    my @dl2content = @{$ref_dl2content};
    return 1 if !(ref $dl2content[0]) || ($dl2content[0]->tag() ne "dd");

    # print STDERR "CHANGING\n"; $he->dump();
    html_replace_by_ignore($inner_dl);
    splice(@{$ref_dlcontent}, 1, 0, @dl2content);
    # The moved elements now live in the outer <dl>; point their parent
    # links there instead of at the discarded inner <dl>.
    for my $moved (@dl2content) {
        $moved->parent($outer_dl) if ref $moved;
    }
    # print STDERR "CHANGED TO:\n"; $he->dump();
    return 0;                   # don't traverse children
}
| 1355 | |
| 1356 | |
Fred Drake | 3fe1d32 | 1999-01-08 15:25:29 +0000 | [diff] [blame] | 1357 | # If we find a paragraph that looks like |
| 1358 | # <P> |
| 1359 | # <HR> |
| 1360 | # <UL> |
| 1361 | # then accumulate its links into a contents_list and delete the paragraph. |
# Traversal callback: a <P> whose content is exactly an <HR> followed by a
# <UL> is handed to process_child_links and then deleted; 0 is returned so
# its children are not visited.  Anything else returns 1.  End-tag calls
# return nothing.
sub process_if_child_links ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    return if !$startflag;
    return 1 if $he->tag() ne "p";

    my $ref_content = $he->content();
    return 1 if !defined($ref_content);

    my @kids = @{$ref_content};
    my $is_child_links = (scalar(@kids) == 2)
        && (ref $kids[0]) && ($kids[0]->tag() eq "hr")
        && (ref $kids[1]) && ($kids[1]->tag() eq "ul");
    return 1 if !$is_child_links;

    process_child_links($he);
    $he->delete();
    return 0;
}
| 1379 | |
| 1380 | |
| 1381 | # If we find |
| 1382 | # <H4> |
| 1383 | # "Footnotes" |
| 1384 | # <DL> |
| 1385 | # <DT> |
| 1386 | # <A NAME="foot560"> |
| 1387 | # "...borrow" |
| 1388 | # <A HREF="refcountsInPython.html#tex2html2" NAME="foot560"> |
| 1389 | # "1.2" |
| 1390 | # <DD> |
| 1391 | # "The metaphor of ``borrowing'' a reference is not completely correct: the owner still has a copy of the reference. " |
| 1392 | # ... |
| 1393 | # then record the footnote information and delete the section and list. |
| 1394 | |
# Set after the "Footnotes" heading has been seen; the next element
# visited must then be the <DL> that holds the footnotes.
my $process_if_footnotes_expect_dl_next = 0;

# Traversal callback that harvests LaTeX2HTML's footnote section.
#
# An <H4> whose only content is the string "Footnotes" is replaced by an
# <ignore> element.  The <DL> that follows must consist of <DT>/<DD> pairs
# where each <DT> holds two anchors with the same NAME.  Each <DD> is stored
# in %footnotes under that name, and both it and the <DL> are replaced by
# <ignore> elements.
#
# Returns 0 when the element was consumed here, 1 otherwise; end-tag calls
# return nothing.  Dies if the expected structure is not found.
sub process_if_footnotes ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    return if !$startflag;

    if (($he->tag() eq "h4")
        && has_single_content_string($he)
        && ($ {$he->content}[0] eq "Footnotes")) {
        html_replace_by_ignore($he);
        $process_if_footnotes_expect_dl_next = 1;
        return 0;
    }

    if ($process_if_footnotes_expect_dl_next && ($he->tag() eq "dl")) {
        my $ref_content = $he->content();
        if (defined $ref_content) {
            $process_if_footnotes_expect_dl_next = 0;
            # Iterate over a copy: the <dd> elements are replaced in the
            # real content as we go.
            my @content = @{$ref_content};
            for (my $i=0; $i<$#content; $i+=2) {
                my $he_dt = $content[$i];
                my $he_dd = $content[$i+1];
                if (($he_dt->tag ne "dt") || ($he_dd->tag ne "dd")) {
                    $he->dump;
                    die "expected <DT> and <DD> at positions $i and ", $i+1;
                }
                my @dt_content = @{$he_dt->content()};
                if ((scalar(@dt_content) != 2)
                    || ($dt_content[0]->tag ne "a")
                    || ($dt_content[1]->tag ne "a")) {
                    $he_dt->dump;
                    die "Expected 2 anchors as content of <DT>";
                }
                # Only the NAME of each anchor is needed.  The second name
                # must come from the SECOND anchor, or the comparison below
                # could never fail.
                my ($dt1_name) = anchor_info($dt_content[0]);
                my ($dt2_name) = anchor_info($dt_content[1]);
                if ($dt1_name ne $dt2_name) {
                    $he_dt->dump;
                    die "Expected identical names for anchors";
                }
                html_replace_by_ignore($he_dd);
                $he_dd->tag("div"); # has no effect
                $footnotes{$dt1_name} = $he_dd;
            }
            html_replace_by_ignore($he);
            return 0;
        }
    }

    if ($process_if_footnotes_expect_dl_next) {
        $he->dump;
        die "Expected <DL> for footnotes next";
    }

    return 1;
}
| 1444 | |
| 1445 | |
| 1446 | |
| 1447 | ## Merge two adjacent paragraphs containing <DL> items, such as: |
| 1448 | # <P> |
| 1449 | # <DL> |
| 1450 | # <DT> |
| 1451 | # ... |
| 1452 | # <DD> |
| 1453 | # ... |
| 1454 | # <P> |
| 1455 | # <DL> |
| 1456 | # <DT> |
| 1457 | # ... |
| 1458 | # <DD> |
| 1459 | # ... |
| 1460 | |
# Traversal callback: among the children of $he, merge each run of adjacent
# <P> elements that each contain a single <DL>.  The later paragraphs are
# removed from $he and the children of their <DL>s are appended to the
# first paragraph's <DL>.
#
# Returns 1 once the children have been processed.  Returns nothing on
# end-tag calls and for elements without content.
sub merge_dl ( $$$ )
{
    my ($he, $startflag) = (check_args(3, @_))[0,1]; # ignore depth argument
    return if !$startflag;

    my $ref_content = $he->content;
    return if !defined $ref_content;

    my $i = 0;
    while ($i < scalar(@{$ref_content})-1) {
        my $p1 = $ {$ref_content}[$i];
        if ((ref $p1) && ($p1->tag eq "p")
            && has_single_content_with_tag($p1, "dl")) {
            my $dl1 = $ {$p1->content}[0];
            # In this loop, rhs, not lhs, of < comparison changes,
            # because we are removing elements from the content of $he.
            while ($i < scalar(@{$ref_content})-1) {
                my $p2 = $ {$ref_content}[$i+1];
                if (!((ref $p2) && ($p2->tag eq "p")
                      && has_single_content_with_tag($p2, "dl"))) {
                    last;
                }
                # Merge these two elements.
                splice(@{$ref_content}, $i+1, 1); # remove $p2
                my $dl2 = $ {$p2->content}[0];
                # An empty <dl> has no content array; nothing to move then.
                my $ref_dl2content = $dl2->content;
                if (defined $ref_dl2content) {
                    $dl1->push_content(@{$ref_dl2content}); # put $dl2's content in $dl1
                }
            }
            # extra increment because next element isn't a candidate for $p1
            $i++;
        }
        $i++;
    }
    return 1;
}
| 1492 | |
| 1493 | |
| 1494 | |
| 1495 | ########################################################################### |
| 1496 | ### Testing |
| 1497 | ### |
| 1498 | |
# test(ACTION, FILE): developer entry point, reached from the command line
# via `html2texi.pl -test [[ACTION] FILE]'.  ACTION selects what is run on
# FILE:
#   "view" or ""  parse FILE with file_to_tree() and dump the resulting tree
#   "raw"         parse FILE with a plain HTML::TreeBuilder and dump the tree
#   "section"     run process_section_file() on FILE
#   "index"       run process_index_file() on FILE, then print_index_info()
# Any other ACTION is a fatal error.
sub test ( $$ )
{ my ($action, $file) = check_args(2, @_);

  # General testing
  if (($action eq "view") || ($action eq ""))
    { my $tree = file_to_tree($file);

      ## Testing
      # print STDERR $tree->as_HTML;
      $tree->dump();

      # Further inspection aids, left disabled:
      # print STDERR $tree->tag(), "\n";
      # print STDERR @{$tree->content()}, "\n";
      #
      # for (@{ $tree->extract_links(qw(a img)) }) {
      #   my ($link, $linkelem) = @$_;
      #   print STDERR "$link ", $linkelem->as_HTML;
      # }
      #
      # print STDERR @{$tree->extract_links()}, "\n";

      # my @top_level_elts = @{$tree->content()};
      return;
    }

  # Dump the tree as the parser built it, without going through
  # file_to_tree().
  elsif ($action eq "raw")
    { my $tree = HTML::TreeBuilder->new;
      $tree->ignore_unknown(1);
      # $tree->warn(1);

      # parse_file returns undef (with $! set) when FILE cannot be opened;
      # without this check an empty tree would be dumped silently.
      defined($tree->parse_file($file))
        or die "Cannot parse `${file}': $!";

      $tree->dump();

      # cleanup_parse_tree($tree);
      # $tree->dump();
      return;
    }

  # Test dealing with a section.
  elsif ($action eq "section")
    { process_section_file($file, 0, "Title"); }

  # Test dealing with many sections.  Deliberately disabled (the condition
  # is the constant 0): the files live in one developer's private copy of
  # the Python API documentation.
  elsif (0)
    { my $doc_dir = "/homes/fish/mernst/tmp/python-doc/html/api";
      # contents.html and genindex.html are intentionally not in this list.
      my @files = map { "$doc_dir/$_.html" }
        qw(about abstract api cObjects complexObjects concrete
           countingRefs debugging dictObjects embedding exceptionHandling
           exceptions fileObjects floatObjects front fundamental
           importing includes index initialization intObjects intro
           listObjects longObjects mapObjects mapping newTypes node24
           noneObject number numericObjects object objects os
           otherObjects processControl refcountDetails refcounts
           sequence sequenceObjects standardExceptions stringObjects
           threads tupleObjects typeObjects types utilities veryhigh);
      for my $section_file (@files)
        { print STDERR "\n", "=" x 75, "\n", "$section_file:\n";
          process_section_file($section_file, 0, "Title");
        }
    }

  # Test dealing with index.
  elsif ($action eq "index")
    { process_index_file($file, "\@cindex");
      print_index_info();
    }

  else
    { die "Unrecognized action `$action'"; }
}
| 1624 | |
| 1625 | |
| 1626 | ########################################################################### |
| 1627 | ### Main loop |
| 1628 | ### |
| 1629 | |
# process_contents_file(FILE): top-level driver for a conversion.
# FILE is the main (contents) HTML page.  The sub
#   - derives the output name from FILE and opens "<name>.texi" on the
#     global filehandle TEXI,
#   - records the directory part of FILE in the global $html_directory,
#   - writes the start of the Texinfo header,
#   - processes FILE as the "Top" section, then every entry queued in the
#     global @contents_list,
#   - writes one node per title in @index_titles, then @contents and @bye.
# Dies if the output file cannot be opened or closed.
sub process_contents_file ( $ )
{ my ($file) = check_args(1, @_);

  # Output name: strip ".html" (and a trailing "index" component), then any
  # leading directories.  A bare "index.html" leaves nothing, in which case
  # the name of the current directory is used instead.
  # (could also use File::Basename)
  my $info_file = $file;
  $info_file =~ s/(\/?index)?\.html$//;
  if ($info_file eq "")
    { require Cwd;              # core module; avoids shelling out to `pwd`
      $info_file = Cwd::getcwd();
      defined($info_file)
        or die "Cannot determine the current directory: $!";
    }
  $info_file =~ s/^.*\///;      # not the most efficient way to remove dirs

  # Directory part of FILE, including the trailing slash (empty when FILE
  # has no directory component).
  $html_directory = $file;
  $html_directory =~ s/(\/|^)[^\/]+$/$1/;

  # TEXI stays a bareword (package-global) handle, presumably because other
  # subs in this file print to it -- confirm before making it lexical.
  # Three-argument open keeps characters in the name from being read as an
  # open mode, and failure is now fatal instead of silently ignored.
  my $texi_file = "$info_file.texi";
  open(TEXI, '>', $texi_file)
    or die "Cannot open `${texi_file}' for writing: $!";

  print TEXI "\\input texinfo \@c -*-texinfo-*-\n";
  print TEXI "\@c %**start of header\n";
  print TEXI "\@setfilename $info_file\n";

  # The following standard segments of a Texinfo file are not written by
  # this sub; their descriptions are kept here for reference.

  # 2. Summary Description and Copyright
  #    The "Summary Description and Copyright" segment describes the
  #    document and contains the copyright notice and copying permissions
  #    for the Info file.  The segment must be enclosed between `@ifinfo'
  #    and `@end ifinfo' commands so that the formatters place it only in
  #    the Info file.
  #
  #    The summary description and copyright segment does not appear in the
  #    printed document.
  #
  #      @ifinfo
  #      This is a short example of a complete Texinfo file.
  #
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end ifinfo


  # 3. Title and Copyright
  #    The "Title and Copyright" segment contains the title and copyright
  #    pages and copying permissions for the printed manual.  The segment
  #    must be enclosed between `@titlepage' and `@end titlepage'
  #    commands.  The title and copyright page appear only in the printed
  #    manual.
  #
  #    The titlepage segment does not appear in the Info file.
  #
  #      @titlepage
  #      @sp 10
  #      @comment The title is printed in a large font.
  #      @center @titlefont{Sample Title}
  #
  #      @c The following two commands start the copyright page.
  #      @page
  #      @vskip 0pt plus 1filll
  #      Copyright @copyright{} 1990 Free Software Foundation, Inc.
  #      @end titlepage


  # 4. `Top' Node and Master Menu
  #    The "Master Menu" contains a complete menu of all the nodes in the
  #    whole Info file.  It appears only in the Info file, in the `Top'
  #    node.
  #
  #    The `Top' node contains the master menu for the Info file.  Since a
  #    printed manual uses a table of contents rather than a menu, the master
  #    menu appears only in the Info file.
  #
  #      @node Top, First Chapter, , (dir)
  #      @comment  node-name,  next,          previous, up
  #
  #      @menu
  #      * First Chapter::    The first chapter is the
  #                           only chapter in this sample.
  #      * Concept Index::    This index has two entries.
  #      @end menu


  # Process the contents page itself as "Top".  The entry records $file,
  # the argument actually being processed, rather than reaching back into
  # @ARGV.
  $current_ref_tdf = [ "Top", 0, $file ];
  process_section_file($file, 0, "Top");

  # Drain the work queue.  Each entry is a three-element array ref whose
  # elements are handed to process_section_file() in reverse order; judging
  # by the "Top" entry above they are (title, depth, file) -- the meaning
  # of the middle element is an assumption, confirm against
  # process_section_file().
  while (scalar(@contents_list))
    { $current_ref_tdf = shift @contents_list;
      my ($title, $depth, $section_file) = @{$current_ref_tdf};
      process_section_file($section_file, $depth, $title);
    }

  # One unnumbered node per collected index title.
  print TEXI "\n";
  for my $indextitle (@index_titles)
    { print TEXI "\@node $indextitle\n";
      print TEXI "\@unnumbered $indextitle\n";
      print TEXI "\@printindex $ {$index_info{$indextitle}}[1]\n";
      print TEXI "\n";
    }

  print TEXI "\@contents\n";
  print TEXI "\@bye\n";

  # Buffered write errors (e.g. a full disk) only surface at close.
  close(TEXI)
    or die "Error closing `${texi_file}': $!";
}
| 1726 | |
| 1727 | # This needs to be last so global variable initializations are reached. |
| 1728 | |
# Command-line dispatch:
#   html2texi.pl -test [[ACTION] FILE]   run test() and exit
#   html2texi.pl FILE                    convert the tree rooted at FILE
unless (@ARGV)
  { die "No arguments supplied to html2texi.pl"; }

if ($ARGV[0] eq "-test")
  { my @test_args = @ARGV;
    shift @test_args;                   # drop the "-test" flag itself
    my $num_test_args = @test_args;
    if ($num_test_args > 2)
      { die "Too many test arguments passed to html2texi: ", join(" ", @ARGV); }

    # The file is the last test argument and the action the one before it;
    # when absent they default to "index.html" and "" respectively.
    my $test_file   = $num_test_args ? pop @test_args : "index.html";
    my $test_action = @test_args ? $test_args[0] : "";
    test($test_action, $test_file);
    exit();
  }

die "Pass one argument, the main/contents page"
  unless scalar(@ARGV) == 1;

process_contents_file($ARGV[0]);
Fred Drake | 54bad44 | 1999-01-14 18:17:07 +0000 | [diff] [blame] | 1749 | |
| 1750 | # end of html2texi.pl |