""" TeXcheck.py -- rough syntax checking on Python style LaTeX documents.

   Written by Raymond D. Hettinger <python at rcn.com>
   Copyright (c) 2003 Python Software Foundation.  All rights reserved.

Designed to catch common markup errors including:
* Unbalanced or mismatched parentheses, brackets, and braces.
* Unbalanced or mismatched \\begin and \\end blocks.
* Misspelled or invalid LaTeX commands.
* Use of forward slashes instead of backslashes for commands.
* Table line size mismatches.

Sample command line usage:
    python texcheck.py -k chapterheading -m lib/librandomtex *.tex

Options:
    -m:         Munge parentheses and brackets.  [0,n) would normally mismatch.
    -k keyword: Keyword is a valid LaTeX command.  Do not include the backslash.
    -d:         Delimiter check only (useful for non-LaTeX files).
    -h:         Help
    -s lineno:  Start at lineno (useful for skipping complex sections).
    -v:         Verbose.  Trace the matching of \\begin and \\end blocks.
"""

import re
import sys
import getopt
from itertools import izip, count, islice
import glob

# Whitespace-separated list of the LaTeX commands (with their leading
# backslash) that the checker accepts as valid by default.  It is split
# into a set by checkit(); additional commands can be supplied with -k.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \menuselection \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e \menuselection
    \exindex \linev \newsgroup \verbatim \setshortversion
    \author \authoraddress \paragraph \subparagraph \cmemberline
    \textbar \C \seelink
"""

def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that a closing delimiter matches the most recent opener.

    c_lineno and c_symbol describe the closing delimiter just encountered.
    openers is the stack (a list) of pending (lineno, symbol) openers; its
    top entry is always popped, whether or not it matches.
    pairmap maps a closing punctuation mark to the opener(s) allowed to
    precede it.  A closer that is not a key of pairmap (such as the name of
    an \\end block) must match an opener with exactly the same symbol.

    A warning is printed when the stack is empty or the opener does not
    match.  Nothing is returned.
    """
    try:
        o_lineno, o_symbol = openers.pop()
    except IndexError:
        # The messages start with a newline so that they are readable even
        # when a verbose trace line is still pending on the current line.
        print("\nDelimiter mismatch. On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol))
        return
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return
    print("\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))

def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines (for example an open file).

    Opts is a mapping of options to option values if any:
        -m          munge parentheses and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno:  line number to start scan (default is 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    All findings are printed to standard output.  Always returns 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'\/([A-Za-z]+)')  # Mismarked with forward slash

    validcmds = set(cmdstr.split())
    for cmd in morecmds:
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']': '[(', ')': '(['}    # Munged openers
    else:
        pairmap = {']': '[', ')': '('}      # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')
    # The range is A-Za-z; "A-z" would also admit the punctuation characters
    # that sit between 'Z' and 'a' in ASCII.
    doubledwords = re.compile(r'(\b[A-Za-z]+\b) \b\1\b')
    spacingmarkup = re.compile(r'\\(ABC|ASCII|C|Cpp|EOF|infinity|NULL|plusminus|POSIX|UNIX)\s')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    # Option flags do not change during the scan, so look them up once.
    verbose = '-v' in opts
    delimsonly = '-d' in opts

    # Line numbers are 1-based; clamp so that "-s 0" (or a negative value)
    # does not hand islice() a negative start index.
    startline = max(int(opts.get('-s', '1')), 1)
    lineno = 0                              # Stays 0 when nothing is scanned

    for offset, line in enumerate(islice(source, startline - 1, None)):
        lineno = startline + offset
        line = line.rstrip()

        # Check balancing of open/close parenthesis, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # No newline yet: the resulting stack is appended below.
                sys.stdout.write('%s | %s %s %s' % (lineno, begend, name, punct))
            if begend == 'begin' and not delimsonly:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delimsonly:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print('  -->  %s' % (openers,))

        # Balance opening and closing braces
        for opener, closer in braces.findall(line):
            if opener == '{':
                bracestack.append(lineno)
            if closer == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print(r'Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delimsonly:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command.
        # Lines with urls or /rfc822 are skipped to avoid false positives.
        if '822' not in line and '.html' not in line:
            for cmd in falsetexcmd.findall(line):
                if '\\' + cmd in validcmds:
                    print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Check for markup requiring {} for correct spacing
        for cmd in spacingmarkup.findall(line):
            print(r'Warning, \%s should be written as \%s{} on line %d' % (cmd, cmd, lineno))

        # Validate commands.  A \newcommand{\name} definition makes \name
        # valid from this line on.
        nc = line.find(r'\newcommand')
        if nc != -1:
            start = line.find('{', nc)
            end = line.find('}', start)
            # Only register the name when both braces are on this line;
            # otherwise the slice would be an arbitrary piece of the line.
            if start != -1 and end != -1:
                validcmds.add(line[start+1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash.
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print(r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,))

        for dw in doubledwords.findall(line):
            print(r'Doubled word warning.  "%s" on line %d' % (dw, lineno))

    lastline = lineno
    for o_lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, o_lineno))
    for b_lineno in bracestack:
        print("Unmatched { on line %d" % (b_lineno,))
    print('Done checking %d lines.' % (lastline,))
    return 0

def main(args=None):
    """Command line entry point: check every file named in args.

    args is the list of command line arguments (default: sys.argv[1:]).
    File specifications containing '*' or '?' are expanded with glob.

    Returns 0 after printing the help text (-h or an empty argument list),
    1 when there is no file to check, 2 as soon as a file cannot be
    opened, and otherwise the largest value returned by checkit().
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or args == []:
        print(__doc__)
        return 0

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a new list rather than editing arglist while
    # iterating over it, which skips an entry whenever a pattern matches
    # no files.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)

    if not filenames:
        # Every specification was a wildcard that matched nothing.
        print('No files match the given file specification.')
        return 1

    # -k may be given several times, so collect from the raw option list.
    morecmds = [v for k, v in optitems if k == '-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print("Checking %s" % (filename,))
        try:
            f = open(filename)
        except IOError:
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)

# Run the checker when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == '__main__':
    sys.exit(main())