# blob: 08692b3a4b027863b23da63a556fb8b901adce1a
""" TeXcheck.py -- rough syntax checking on Python style LaTeX documents.

   Written by Raymond D. Hettinger <python at rcn.com>
   Copyright (c) 2003 Python Software Foundation.  All rights reserved.

Designed to catch common markup errors including:
* Unbalanced or mismatched parentheses, brackets, and braces.
* Unbalanced or mismatched \\begin and \\end blocks.
* Misspelled or invalid LaTeX commands.
* Use of forward slashes instead of backslashes for commands.
* Table line size mismatches.

Sample command line usage:
    python texcheck.py -k chapterheading -m lib/librandomtex *.tex

Options:
    -m          Munge parentheses and brackets. [0,n) would normally mismatch.
    -k keyword: Keyword is a valid LaTeX command. Do not include the backslash.
    -d:         Delimiter check only (useful for non-LaTeX files).
    -h:         Help
    -s lineno:  Start at lineno (useful for skipping complex sections).
    -v:         Verbose.  Trace the matching of \\begin and \\end blocks.
"""

import getopt
import glob
import re
import sys
from itertools import count, islice

try:
    from itertools import izip
except ImportError:         # Python 3: the builtin zip is already lazy
    izip = zip

# Whitespace-separated list of the LaTeX commands (with their leading
# backslash) that checkit() accepts as valid.  It is a raw string, so
# entries such as \n are a literal backslash followed by a letter.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \menuselection \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e \menuselection
    \exindex \linev \newsgroup \verbatim \setshortversion
    \author \authoraddress \paragraph \subparagraph \cmemberline
"""

def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that closing delimiter matches most recent opening delimiter.

    c_lineno -- line number on which the closing delimiter was found.
    c_symbol -- the closing delimiter: a punctuation character or the
                name taken from an \\end{name} block.
    openers  -- stack (list) of (lineno, symbol) pairs for the openers
                still pending; the most recent entry is popped.
    pairmap  -- maps a closing punctuation character to the opening
                characters it may match.  A closer that is not a key
                (for example an environment name) must match itself.

    Prints a diagnostic when the stack is empty or when the popped opener
    does not match.  Always returns None.
    """
    try:
        o_lineno, o_symbol = openers.pop()
    except IndexError:
        print("\nDelimiter mismatch. On line %d, encountered closing '%s' "
              "without corresponding open" % (c_lineno, c_symbol))
        return
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return
    print("\nOpener '%s' on line %d was not closed before encountering '%s' "
          "on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))
    return

def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines (for example an open file).

    Opts is a mapping of options to option values if any:
        -m          munge parentheses and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno:  linenumber to start scan (default is 1; smaller
                    values are treated as 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    All findings are printed to standard output.  Always returns 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'\/([A-Za-z]+)')  # Mismarked with forward slash

    validcmds = set(cmdstr.split())
    for cmd in morecmds:
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']': '[(', ')': '(['}    # Munged openers
    else:
        pairmap = {']': '[', ')': '('}      # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')
    # The letter class is [A-Za-z]; a range written A-z would also accept
    # the punctuation characters that sit between 'Z' and 'a' in ASCII.
    doubledwords = re.compile(r'(\b[A-Za-z]+\b) \b\1\b')
    spacingmarkup = re.compile(r'\\(ABC|ASCII|C|Cpp|EOF|infinity|NULL|plusminus|POSIX|UNIX)\s')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    verbose = '-v' in opts
    delimsonly = '-d' in opts

    # islice() rejects a negative start, so never begin before line 1.
    startline = max(1, int(opts.get('-s', '1')))
    lastline = 0                            # Stays 0 when nothing is scanned

    for lineno, line in enumerate(islice(source, startline - 1, None), startline):
        lastline = lineno
        line = line.rstrip()

        # Check balancing of open/close parenthesis, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # No newline here: the resulting stack is appended below.
                sys.stdout.write('%s | %s %s %s ' % (lineno, begend, name, punct))
            if begend == 'begin' and not delimsonly:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delimsonly:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print(' -->  %s' % (openers,))

        # Balance opening and closing braces
        for openbrace, closebrace in braces.findall(line):
            if openbrace == '{':
                bracestack.append(lineno)
            if closebrace == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print(r'Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delimsonly:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command
        for cmd in falsetexcmd.findall(line):
            if '822' in line or '.html' in line:
                continue    # Ignore false positives for urls and for /rfc822
            if '\\' + cmd in validcmds:
                print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Check for markup requiring {} for correct spacing
        for cmd in spacingmarkup.findall(line):
            print(r'Warning, \%s should be written as \%s{} on line %d' % (cmd, cmd, lineno))

        # Validate commands
        nc = line.find(r'\newcommand')
        if nc != -1:
            start = line.find('{', nc)
            end = line.find('}', start)
            # Only register the new command when both braces are on this
            # line; otherwise the slice would be an arbitrary fragment.
            if start != -1 and end != -1:
                validcmds.add(line[start+1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash.
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print(r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,))

        for dw in doubledwords.findall(line):
            print(r'Doubled word warning.  "%s" on line %d' % (dw, lineno))

    # Report everything that was opened but never closed
    for openline, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, openline))
    for openline in bracestack:
        print("Unmatched { on line %d" % (openline,))
    print('Done checking %d lines.' % (lastline,))
    return 0

def main(args=None):
    """Command line driver: parse options and run checkit() on each file.

    Args is the list of command line arguments (without the program
    name); when None, sys.argv[1:] is used.  File arguments containing
    '*' or '?' are expanded with glob.

    Returns the exit status:
        0  help was shown (-h or no arguments at all)
        1  no file to check was given, or no wildcard matched any file
        2  a file could not be opened (remaining files are not checked)
    Otherwise the largest value returned by checkit() is returned.
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or not args:
        print(__doc__)
        return 0

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a new list; replacing slices of arglist while
    # looping over it would skip the argument after an unmatched pattern.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)

    if not filenames:
        print('No files matched the given file specification')
        return 1

    morecmds = [v for k, v in optitems if k == '-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print("Checking %s" % (filename,))
        try:
            f = open(filename)
        except IOError:
            # Name the file that actually failed.
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)

# Run the checker when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
    sys.exit(main())