blob: 68be4511d2533760ba6a02e2bfe1ca16fe8d8b21 [file] [log] [blame]
""" TeXcheck.py -- rough syntax checking on Python style LaTeX documents.

   Written by Raymond D. Hettinger <python at rcn.com>
   Copyright (c) 2003 Python Software Foundation. All rights reserved.

Designed to catch common markup errors including:
* Unbalanced or mismatched parentheses, brackets, and braces.
* Unbalanced or mismatched \\begin and \\end blocks.
* Misspelled or invalid LaTeX commands.
* Use of forward slashes instead of backslashes for commands.
* Table line size mismatches.

Sample command line usage:
    python texcheck.py -k chapterheading -m lib/librandomtex *.tex

Options:
    -m          Munge parentheses and brackets. [0,n) would normally mismatch.
    -k keyword: Keyword is a valid LaTeX command. Do not include the backslash.
    -d:         Delimiter check only (useful for non-LaTeX files).
    -h:         Help
    -s lineno:  Start at lineno (useful for skipping complex sections).
    -v:         Verbose. Trace the matching of \\begin and \\end blocks.
"""
24
Raymond Hettinger71e00332003-05-10 03:30:13 +000025import re
26import sets
27import sys
28import getopt
29from itertools import izip, count, islice
Raymond Hettinger6e0f5e02003-05-16 03:06:39 +000030import glob
Raymond Hettinger71e00332003-05-10 03:30:13 +000031
# Whitespace-separated list of LaTeX commands accepted as valid by checkit().
# Each entry includes its leading backslash; the string is raw so sequences
# such as \n and \e stay literal.  Consumers call cmdstr.split().
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e
    \exindex \linev \newsgroup \verbatim \setshortversion
"""
61
def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that a closing delimiter matches the most recent opener.

    c_lineno -- line number on which the closer was found.
    c_symbol -- the closer: a punctuation character, or the environment
                name taken from an \\end{name} block.
    openers  -- stack (list) of (lineno, symbol) pairs for pending openers.
                The top entry is always popped, whether or not it matches.
    pairmap  -- maps a closing punctuation character to the opener
                character(s) it may close.  A closer that is not a key
                (an environment name) must equal its opener exactly.

    Prints a diagnostic on a mismatch or when the stack is empty.
    Always returns None.
    """
    try:
        o_lineno, o_symbol = openers.pop()
    except IndexError:
        # Nothing is pending, so this closer has no partner at all.
        print("\nDelimiter mismatch. On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol))
        return
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return
    print("\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))
Raymond Hettinger71e00332003-05-10 03:30:13 +000072
def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    source is any iterable of text lines (an open file works).

    Opts is a mapping of options to option values if any:
        -m          munge parenthesis and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno:  linenumber to start scan (default is 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    All findings are printed to standard output.  The return value is
    always 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'\/([A-Za-z]+)')  # Mismarked with forward slash

    validcmds = set(cmdstr.split())
    for cmd in morecmds:
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']': '[(', ')': '(['}    # Munged openers
    else:
        pairmap = {']': '[', ')': '('}      # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')
    # Letters only: the range A-z would also take in [ \ ] ^ _ and backtick.
    doubledwords = re.compile(r'(\b[A-Za-z]+\b) \b\1\b')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    verbose = '-v' in opts
    delims_only = '-d' in opts

    startline = int(opts.get('-s', '1'))
    lastline = 0                            # Stays 0 when nothing is scanned

    for offset, line in enumerate(islice(source, startline - 1, None)):
        lineno = lastline = startline + offset
        line = line.rstrip()

        # Check balancing of open/close parenthesis, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # No newline yet: the stack state is appended after matching.
                sys.stdout.write('%s | %s %s %s ' % (lineno, begend, name, punct))
            if begend == 'begin' and not delims_only:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delims_only:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print(' -->  %s' % (openers,))

        # Balance opening and closing braces
        for opener, closer in braces.findall(line):
            if opener == '{':
                bracestack.append(lineno)
            if closer == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print(r'Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delims_only:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command.
        # Lines holding urls or /rfc822 are skipped as known false positives.
        if not ('822' in line or '.html' in line):
            for cmd in falsetexcmd.findall(line):
                if '\\' + cmd in validcmds:
                    print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Validate commands.  A \newcommand{\name} definition registers
        # \name as valid, but only when both braces are on this line.
        nc = line.find(r'\newcommand')
        if nc != -1:
            start = line.find('{', nc)
            end = line.find('}', start)
            if start != -1 and end != -1:
                validcmds.add(line[start + 1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash.
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print(r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,))

        for dw in doubledwords.findall(line):
            print(r'Doubled word warning. "%s" on line %d' % (dw, lineno))

    # Anything still on a stack was opened but never closed.
    for open_lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, open_lineno))
    for brace_lineno in bracestack:
        print("Unmatched { on line %d" % (brace_lineno,))
    print('Done checking %d lines.' % (lastline,))
    return 0
189
def main(args=None):
    """Command line entry point: parse options and check each named file.

    args is the argument list without the program name; it defaults to
    sys.argv[1:].  File arguments containing '*' or '?' are expanded with
    glob.

    Returns an exit status:
        0 -- help was shown, or every file was checked
        1 -- no file was named, or the wildcards matched nothing
        2 -- a file could not be opened (checking stops at that file)
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or args == []:
        print(__doc__)
        return 0

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a separate list.  Rewriting arglist in place
    # while iterating over it skips the entry that follows an empty match.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)

    if not filenames:
        print('No files matched the given pattern(s)')
        return 1

    morecmds = [v for k, v in optitems if k == '-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print('Checking %s' % (filename,))
        try:
            f = open(filename)
        except IOError:
            # Name the file that actually failed, not the first argument.
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)
Raymond Hettinger71e00332003-05-10 03:30:13 +0000225
if __name__ == '__main__':
    # Run as a script: main()'s return value becomes the process exit status.
    sys.exit(main())
228