blob: 6da437061018690705af3194966da4eb09cbc639 [file] [log] [blame]
""" TeXcheck.py -- rough syntax checking on Python style LaTeX documents.

   Written by Raymond D. Hettinger <python at rcn.com>
   Copyright (c) 2003 Python Software Foundation. All rights reserved.

Designed to catch common markup errors including:
* Unbalanced or mismatched parentheses, brackets, and braces.
* Unbalanced or mismatched \\begin and \\end blocks.
* Misspelled or invalid LaTeX commands.
* Use of forward slashes instead of backslashes for commands.
* Table line size mismatches.

Sample command line usage:
    python texcheck.py -k chapterheading -m lib/librandomtex *.tex

Options:
    -m:         Munge parentheses and brackets. [0,n) would normally mismatch.
    -k keyword: Keyword is a valid LaTeX command. Do not include the backslash.
    -d:         Delimiter check only (useful for non-LaTeX files).
    -h:         Help
    -s lineno:  Start at lineno (useful for skipping complex sections).
    -v:         Verbose. Trace the matching of \\begin and \\end blocks.
"""
24
Raymond Hettinger71e00332003-05-10 03:30:13 +000025import re
26import sets
27import sys
28import getopt
29from itertools import izip, count, islice
Raymond Hettinger6e0f5e02003-05-16 03:06:39 +000030import glob
Raymond Hettinger71e00332003-05-10 03:30:13 +000031
# Whitespace-separated LaTeX commands (each with its leading backslash) that
# checkit() accepts as valid.  Only the split() tokens matter, so layout and
# repeated entries are harmless.  More commands can be supplied at run time
# with the -k option.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \menuselection \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e \menuselection
    \exindex \linev \newsgroup \verbatim \setshortversion
"""
61
def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that a closing delimiter matches the most recent opener.

    c_lineno and c_symbol are the line number and text of the closer.
    openers is a stack (list) of (lineno, symbol) tuples; its top entry is
    always popped, and is not pushed back when it fails to match.
    pairmap maps a closing symbol to the opening symbols that may pair
    with it.  A closer that is not a key of pairmap (an environment name
    from a begin/end block) must be matched by an identical opener.

    A diagnostic is printed when the stack is empty or the popped opener
    does not match.  Always returns None.
    """
    # print is called with a single pre-formatted string so the output is
    # the same whether print is a statement or a function.
    if not openers:
        print("\nDelimiter mismatch. On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol))
        return
    o_lineno, o_symbol = openers.pop()
    # For ')' and ']' the mapped value is a string of acceptable openers;
    # for anything else fall back to requiring the very same symbol.
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return
    print("\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))
    return
Raymond Hettinger71e00332003-05-10 03:30:13 +000072
def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines (an open file works).

    Opts is a mapping of options to option values if any:
        -m          munge parenthesis and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno:  linenumber to start scan (default is 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    Every finding is printed to standard output.  Always returns 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'\/([A-Za-z]+)')  # Mismarked with forward slash

    validcmds = set(cmdstr.split())
    for cmd in morecmds:
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']': '[(', ')': '(['}    # Munged openers
    else:
        pairmap = {']': '[', ')': '('}      # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    # The option mapping does not change during the scan; look it up once.
    verbose = '-v' in opts
    delimsonly = '-d' in opts

    startline = int(opts.get('-s', '1'))
    lineno = 0      # Reported by the final summary when no line is scanned

    for lineno, line in enumerate(islice(source, startline - 1, None), startline):
        line = line.rstrip()

        # Check balancing of open/close parenthesis, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # No newline yet: the resulting stack is appended below, after
                # any diagnostic printed by matchclose().
                sys.stdout.write('%s | %s %s %s ' % (lineno, begend, name, punct))
            if begend == 'begin' and not delimsonly:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delimsonly:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print(' -->  %s' % (openers,))

        # Balance opening and closing braces
        for lbrace, rbrace in braces.findall(line):
            if lbrace == '{':
                bracestack.append(lineno)
            if rbrace == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print(r'Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delimsonly:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command.
        # Lines mentioning urls or /rfc822 are skipped as false positives.
        if '822' not in line and '.html' not in line:
            for cmd in falsetexcmd.findall(line):
                if '\\' + cmd in validcmds:
                    print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Validate commands
        nc = line.find(r'\newcommand')
        if nc != -1:
            # Register the newly defined command, but only when a braced
            # name is really present; find() returns -1 otherwise and the
            # slice would add a meaningless fragment of the line.
            start = line.find('{', nc)
            if start != -1:
                end = line.find('}', start)
                if end != -1:
                    validcmds.add(line[start+1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print(r'Style warning, avoid use of i.e or e.g. on line %d' % (lineno,))

    # Anything still on the stacks was opened but never closed
    for o_lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, o_lineno))
    for b_lineno in bracestack:
        print("Unmatched { on line %d" % (b_lineno,))
    print('Done checking %d lines.' % (lineno,))
    return 0
186
def main(args=None):
    """Command line entry point: parse the options and check each file.

    args is the argument list without the program name; when it is None,
    sys.argv[1:] is used.  File arguments containing '*' or '?' are
    expanded with glob.

    Returns 0 after printing the help text, 1 when no file was named or
    no wildcard matched any file, 2 as soon as a file cannot be opened,
    and otherwise the largest value returned by checkit() for the files
    that were checked.
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or args == []:
        print(__doc__)
        return 0

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a new list.  Replacing slices of arglist while
    # iterating over it skips the argument that follows a pattern with no
    # matches.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)

    if not filenames:
        # Only wildcards were given and none matched; without this guard
        # max() below would fail on an empty list.
        print('No files matched the given patterns')
        return 1

    morecmds = [v for k, v in optitems if k == '-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print("Checking %s" % (filename,))
        try:
            f = open(filename)
        except IOError:
            # Name the file that actually failed, not the first argument
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)
Raymond Hettinger71e00332003-05-10 03:30:13 +0000222
if __name__ == '__main__':
    # Run as a script: hand main()'s return value to sys.exit().
    exit_status = main()
    sys.exit(exit_status)
225