blob: 1bd62ba24a00acdd53883b3b33d278b21207e13a [file] [log] [blame]
Nick Kralevichf73ff172014-09-27 12:41:49 -07001#! /usr/bin/perl -w
2
Janis Danisevskis112c9cc2016-03-31 13:35:25 +01003# Script to turn PCRE2 man pages into HTML
Nick Kralevichf73ff172014-09-27 12:41:49 -07004
5
6# Subroutine to handle font changes and other escapes
7
sub do_line {
  # Takes one line of man page text and returns a copy converted for HTML
  # output: angle brackets become numeric entities, \fI...\fR / \fB...\fP
  # font changes become <i>/<b> elements, the troff escape \e becomes a
  # literal backslash, and "(c)" directly after "Copyright " becomes &copy;.
  my $text = shift;

  # Alias $_ to the copy so that each substitution applies to it in turn.
  # The angle brackets must be escaped before the font tags are inserted,
  # otherwise the generated markup would be escaped as well.
  for ($text)
    {
    s{<}{&#60;}g;
    s{>}{&#62;}g;
    s{\\fI(.*?)\\f[RP]}{<i>$1</i>}g;
    s{\\fB(.*?)\\f[RP]}{<b>$1</b>}g;
    s{\\e}{\\}g;
    s{(?<=Copyright )\(c\)}{&copy;}g;
    }

  return $text;
}
19
20# Subroutine to ensure not in a paragraph
21
sub end_para {
  # Closes the current paragraph on the TEMP handle, if one is open. A
  # literal section that is still open inside the paragraph is closed first.
  # In every case the paragraph, literal and "text written" flags are
  # cleared afterwards.
  if ($inpara)
    {
    my $closing = $inpre ? "</PRE>\n</P>\n" : "</P>\n";
    print TEMP $closing;
    }

  ($inpara, $inpre, $wrotetext) = (0, 0, 0);
  return 0;
}
31
32# Subroutine to start a new paragraph
33
sub new_para {
  # Finishes whatever paragraph is currently open, writes an opening <P>
  # tag to the TEMP handle, and records that a paragraph is now open.
  end_para();
  print TEMP "<P>\n";
  return $inpara = 1;
}
39
40
41# Main program
42
# State shared with the subroutines above (package globals):
#   $innf      - true while inside a .nf/.fi section
#   $inpara    - true while a <P> element is open in the body
#   $inpre     - true while a <pre> element is open in the body
#   $wrotetext - true once text has been written since the last paragraph end
#   $toc       - true if a table of contents was requested with -toc
#   $ref       - number of the next table of contents entry
#
# The man page is read from STDIN. The page header and the table of contents
# go straight to STDOUT, while the body is collected in a temporary file and
# copied to STDOUT at the end, so that the complete table of contents
# precedes the body.

$innf = 0;
$inpara = 0;
$inpre = 0;
$wrotetext = 0;
$toc = 0;
$ref = 1;

# Consume leading options; only -toc is acted upon, anything else starting
# with a hyphen is silently discarded.

while ($#ARGV >= 0 && $ARGV[0] =~ /^-/)
  {
  $toc = 1 if $ARGV[0] eq "-toc";
  shift;
  }

# Initial output to STDOUT

print <<End ;
<html>
<head>
<title>$ARGV[0] specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$ARGV[0] man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
End

print "<ul>\n" if ($toc);

# The body is buffered in an anonymous temporary file, opened for both
# writing and reading. This avoids a predictable name in a world-writable
# directory, and there is nothing to unlink afterwards.

open(TEMP, "+>", undef) || die "Can't create temporary file: $!\n";

while (<STDIN>)
  {
  # Handle lines beginning with a dot

  if (/^\./)
    {
    # Some of the PCRE2 man pages used to contain instances of .br. However,
    # they should have all been removed because they cause trouble in some
    # (other) automated systems that translate man pages to HTML. Complain if
    # we find .br or .in (another macro that is deprecated).

    if (/^\.br/ || /^\.in/)
      {
      print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
      print STDERR "*** $_\n";
      die "*** Processing abandoned\n";
      }

    # Instead of .br, relevant "literal" sections are enclosed in .nf/.fi.

    elsif (/^\.nf/)
      {
      $innf = 1;
      }

    elsif (/^\.fi/)
      {
      $innf = 0;
      }

    # Handling .sp is subtle. If it is inside a literal section, do nothing if
    # the next line is a non literal text line; similarly, if not inside a
    # literal section, do nothing if a literal follows, unless we are inside
    # a .nf/.fi section or about to enter one. The point being that the <pre>
    # and </pre> that delimit literal sections will do the spacing. Always skip
    # if no previous output.

    elsif (/^\.sp/)
      {
      if ($wrotetext)
        {
        $_ = <STDIN>;
        $_ = "" unless defined($_);   # .sp was the last line of the input
        if ($inpre)
          {
          print TEMP "\n" if (/^[\s.]/);
          }
        else
          {
          print TEMP "<br>\n<br>\n" if ($innf || /^\.nf/ || !/^[\s.]/);
          }
        redo;    # Now process the lookahead line we just read
        }
      }
    elsif (/^\.TP/ || /^\.PP/ || /^\.P/)
      {
      &new_para();
      }
    elsif (/^\.SH\s*("?)(.*)\1/)
      {
      # Ignore the NAME section, together with the line that follows it
      if ($2 =~ /^NAME\b/)
        {
        <STDIN>;
        next;
        }

      &end_para();
      my($title) = &do_line($2);
      if ($toc)
        {
        # The title is passed as a printf argument rather than interpolated
        # into the format, so that a % in a title is output literally.
        printf("<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
          $ref, $ref, $title);
        printf TEMP ("<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
          $ref, $title);
        $ref++;
        }
      else
        {
        print TEMP "<br><b>\n$title\n</b><br>\n";
        }
      }
    elsif (/^\.SS\s*("?)(.*)\1/)
      {
      &end_para();
      my($title) = &do_line($2);
      print TEMP "<br><b>\n$title\n</b><br>\n";
      }
    elsif (/^\.B\s*(.*)/)
      {
      &new_para() if (!$inpara);
      $_ = &do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<b>$_</b>\n";
      $wrotetext = 1;
      }
    elsif (/^\.I\s*(.*)/)
      {
      &new_para() if (!$inpara);
      $_ = &do_line($1);
      s/"(.*?)"/$1/g;
      print TEMP "<i>$_</i>\n";
      $wrotetext = 1;
      }

    # A comment that starts "HREF" takes the next line as a name that
    # is turned into a hyperlink, using the text given, which might be
    # in a special font. If it ends in () or (digits) or punctuation, they
    # aren't part of the link.

    elsif (/^\.\\"\s*HREF/)
      {
      $_ = <STDIN>;
      $_ = "" unless defined($_);     # HREF was the last line of the input
      chomp;
      $_ = &do_line($_);
      $_ =~ s/\s+$//;
      if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/)
        {
        print TEMP "<a href=\"$1.html\">$_</a>\n";
        }
      else
        {
        # No link target can be derived; output the text without a link
        # instead of building an href from an unset capture.
        print STDERR "*** Warning: cannot derive a link target from \"$_\"\n";
        print TEMP "$_\n";
        }
      }

    # A comment that starts "HTML" inserts literal HTML

    elsif (/^\.\\"\s*HTML\s*(.*)/)
      {
      print TEMP $1;
      }

    # A comment that starts < inserts that HTML at the end of the
    # *next* input line - so as not to get a newline between them.

    elsif (/^\.\\"\s*(<.*>)/)
      {
      my($markup) = $1;
      $_ = <STDIN>;
      $_ = "" unless defined($_);     # comment was the last line of the input
      chomp;
      $_ = &do_line($_);
      $_ =~ s/\s+$//;
      print TEMP "$_$markup\n";
      }

    # A comment that starts JOIN joins the next two lines together, with one
    # space between them. Then that line is processed. This is used in some
    # displays where two lines are needed for the "man" version. JOINSH works
    # the same, except that it assumes this is a shell command, so removes
    # continuation backslashes.

    elsif (/^\.\\"\s*JOIN(SH)?/)
      {
      my($is_shell) = defined($1);
      my($one,$two);
      $one = <STDIN>;
      $two = <STDIN>;
      $one = "" unless defined($one); # input ended before both lines were read
      $two = "" unless defined($two);
      $one =~ s/\s*\\e\s*$// if ($is_shell);
      chomp($one);
      $two =~ s/^\s+//;
      $_ = "$one $two";
      redo;    # Process the joined lines
      }

    # .EX/.EE are used in the pcre2demo page to bracket the entire program,
    # which is unmodified except for turning backslash into "\e".

    elsif (/^\.EX\s*$/)
      {
      print TEMP "<PRE>\n";
      while (<STDIN>)
        {
        last if /^\.EE\s*$/;
        s/\\e/\\/g;
        s/&/&amp;/g;
        s/</&lt;/g;
        s/>/&gt;/g;
        print TEMP;
        }
      print TEMP "</PRE>\n";          # close the element opened above
      }

    # Ignore anything not recognized

    next;
    }

  # Line does not begin with a dot. Replace blank lines with new paragraphs

  if (/^\s*$/)
    {
    &end_para() if ($wrotetext);
    next;
    }

  # Convert fonts changes and output an ordinary line. Ensure that indented
  # lines are marked as literal.

  $_ = &do_line($_);
  &new_para() if (!$inpara);

  if (/^\s/)
    {
    if (!$inpre)
      {
      print TEMP "<pre>\n";
      $inpre = 1;
      }
    }
  elsif ($inpre)
    {
    print TEMP "</pre>\n";
    $inpre = 0;
    }

  # Add <br> to the end of a non-literal line if we are within .nf/.fi

  $_ .= "<br>\n" if (!$inpre && $innf);

  print TEMP;
  $wrotetext = 1;
  }

# The TOC, if present, will have been written - terminate it

print "</ul>\n" if ($toc);

# Copy the remainder to the standard output. The temporary file is open for
# reading as well as writing, so rewinding it is all that is needed.

seek(TEMP, 0, 0) || die "Can't rewind temporary file: $!\n";

print while (<TEMP>);

print <<End ;
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
End

close(TEMP);
313
314# End