Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 1 | """ TeXcheck.py -- rough syntax checking on Python style LaTeX documents. |
| 2 | |
| 3 | Written by Raymond D. Hettinger <python at rcn.com> |
| 4 | Copyright (c) 2003 Python Software Foundation. All rights reserved. |
| 5 | |
| 6 | Designed to catch common markup errors including: |
| 7 | * Unbalanced or mismatched parentheses, brackets, and braces. |
Raymond Hettinger | 6e0f5e0 | 2003-05-16 03:06:39 +0000 | [diff] [blame] | 8 | * Unbalanced or mismatched \\begin and \\end blocks. |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 9 | * Misspelled or invalid LaTeX commands. |
| 10 | * Use of forward slashes instead of backslashes for commands. |
Raymond Hettinger | 6e0f5e0 | 2003-05-16 03:06:39 +0000 | [diff] [blame] | 11 | * Table line size mismatches. |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 12 | |
Raymond Hettinger | 6e0f5e0 | 2003-05-16 03:06:39 +0000 | [diff] [blame] | 13 | Sample command line usage: |
| 14 | python texcheck.py -k chapterheading -m lib/librandomtex *.tex |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 15 | |
| 16 | Options: |
Raymond Hettinger | 62aa994 | 2003-05-12 23:33:28 +0000 | [diff] [blame] | 17 | -m: Munge parentheses and brackets. [0,n) would normally mismatch. |
| 18 | -k keyword: Keyword is a valid LaTeX command. Do not include the backslash. |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 19 | -d: Delimiter check only (useful for non-LaTeX files). |
| 20 | -h: Help |
| 21 | -s lineno: Start at lineno (useful for skipping complex sections). |
Raymond Hettinger | 6e0f5e0 | 2003-05-16 03:06:39 +0000 | [diff] [blame] | 22 | -v: Verbose. Trace the matching of \\begin and \\end blocks. |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 23 | """ |
| 24 | |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 25 | import re |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 26 | import sys |
| 27 | import getopt |
| 28 | from itertools import izip, count, islice |
Raymond Hettinger | 6e0f5e0 | 2003-05-16 03:06:39 +0000 | [diff] [blame] | 29 | import glob |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 30 | |
# Whitespace-separated list of the LaTeX commands (with their leading
# backslash) that are accepted as valid.  The string is raw so that entries
# such as \n and \e stay literal backslash sequences.  Each command is
# listed exactly once.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e
    \exindex \linev \newsgroup \verbatim \setshortversion
    \author \authoraddress \paragraph \subparagraph \cmemberline
    \textbar \C \seelink
"""
| 62 | |
def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that a closing delimiter matches the most recent opener.

    c_lineno and c_symbol are the line number and text of the closer.
    openers is the stack (a list) of pending (lineno, symbol) openers; its
    top entry is always popped, whether or not it turns out to match.
    pairmap maps a closing punctuation mark to the string of openers that
    may precede it; a closer that is not a key of pairmap (an environment
    name) must be identical to the opener.

    A diagnostic is printed when the closer has no opener or the wrong
    one.  Returns True if the closer matched and False otherwise.
    """
    if not openers:
        # The message starts with a newline because the verbose trace may
        # have left the cursor in the middle of a line.
        print("\nDelimiter mismatch.  On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol))
        return False

    o_lineno, o_symbol = openers.pop()
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return True

    print("\nOpener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno))
    return False
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 73 | |
def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines, such as an open file.

    Opts is a mapping of options to option values if any:
        -m          munge parentheses and brackets
        -d          delimiters only checking
        -v          verbose trace of delimiter matching
        -s lineno   line number to start scan (default is 1)

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    Every finding is printed to standard output.  The return value is
    always 0.
    """

    texcmd = re.compile(r'\\[A-Za-z]+')
    falsetexcmd = re.compile(r'/([A-Za-z]+)')   # Mismarked with forward slash

    validcmds = set(cmdstr.split())
    for cmd in morecmds:
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']': '[(', ')': '(['}        # Munged openers
    else:
        pairmap = {']': '[', ')': '('}          # Normal opener for a given closer
    openpunct = set('([')                       # Set of valid openers

    delimiters = re.compile(r'\\(begin|end){([_a-zA-Z]+)}|([()\[\]])')
    braces = re.compile(r'({)|(})')
    # Letters only.  The range "A-z" would also take in [ \ ] ^ _ and `.
    doubledwords = re.compile(r'(\b[A-Za-z]+\b) \b\1\b')
    spacingmarkup = re.compile(r'\\(ABC|ASCII|C|Cpp|EOF|infinity|NULL|plusminus|POSIX|UNIX)\s')

    openers = []                    # Stack of pending open delimiters
    bracestack = []                 # Stack of pending open braces

    tablestart = re.compile(r'\\begin{(?:long)?table([iv]+)}')
    tableline = re.compile(r'\\line([iv]+){')
    tableend = re.compile(r'\\end{(?:long)?table([iv]+)}')
    tablelevel = ''
    tablestartline = 0

    verbose = '-v' in opts
    delimsonly = '-d' in opts

    # Line numbers are 1-based; clamp so that "-s 0" cannot hand islice a
    # negative start (which raises ValueError).
    startline = max(1, int(opts.get('-s', '1')))
    lastline = 0                    # Number of the last line scanned

    for offset, line in enumerate(islice(source, startline - 1, None)):
        lineno = lastline = startline + offset
        line = line.rstrip()

        # Check balancing of open/close parentheses, brackets, and begin/end blocks
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # No newline yet: the resulting stack is appended below.
                sys.stdout.write('%s | %s %s %s ' % (lineno, begend, name, punct))
            if begend == 'begin' and not delimsonly:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delimsonly:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                sys.stdout.write(' -->  %s\n' % (openers,))

        # Balance opening and closing braces
        for opened, closed in braces.findall(line):
            if opened == '{':
                bracestack.append(lineno)
            if closed == '}':
                try:
                    bracestack.pop()
                except IndexError:
                    print('Warning, unmatched } on line %s.' % (lineno,))

        # Optionally, skip LaTeX specific checks
        if delimsonly:
            continue

        # Warn whenever forward slashes encountered with a LaTeX command.
        # Lines that look like urls or mention rfc822 are ignored because
        # they produce false positives.
        if '822' not in line and '.html' not in line:
            for cmd in falsetexcmd.findall(line):
                if '\\' + cmd in validcmds:
                    print('Warning, forward slash used on line %d with cmd: /%s' % (lineno, cmd))

        # Check for markup requiring {} for correct spacing
        for cmd in spacingmarkup.findall(line):
            print(r'Warning, \%s should be written as \%s{} on line %d' % (cmd, cmd, lineno))

        # Validate commands.  A command defined with \newcommand{\name}
        # becomes valid from this line onward.
        nc = line.find(r'\newcommand')
        if nc != -1:
            start = line.find('{', nc)
            end = line.find('}', start) if start != -1 else -1
            if end != -1:
                validcmds.add(line[start+1:end])
        for cmd in texcmd.findall(line):
            if cmd not in validcmds:
                # cmd already carries its leading backslash.
                print('Warning, unknown tex cmd on line %d: %s' % (lineno, cmd))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d' % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

        # Style guide warnings
        if 'e.g.' in line or 'i.e.' in line:
            print('Style warning, avoid use of i.e. or e.g. on line %d' % (lineno,))

        for dw in doubledwords.findall(line):
            print('Doubled word warning.  "%s" on line %d' % (dw, lineno))

    # Report whatever is still open at the end of the input.
    for o_lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d" % (symbol, o_lineno))
    for b_lineno in bracestack:
        print("Unmatched { on line %d" % (b_lineno,))
    print('Done checking %d lines.' % (lastline,))
    return 0
| 195 | |
def main(args=None):
    """Run the checker from a command line and return an exit status.

    Args is the list of command line arguments without the program name;
    when it is None, sys.argv[1:] is used.  Options are parsed with getopt
    using the option string "k:mdhs:v"; the remaining arguments are file
    names, and any that contain '*' or '?' are expanded with glob.

    Returns 0 after printing the help text (for -h or an empty argument
    list), 1 when there is no file to check, 2 as soon as a file cannot
    be opened, and otherwise the largest value returned by checkit().
    """
    if args is None:
        args = sys.argv[1:]
    optitems, arglist = getopt.getopt(args, "k:mdhs:v")
    opts = dict(optitems)
    if '-h' in opts or args == []:
        print(__doc__)
        return 0

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # Expand wildcards into a fresh list.  Rewriting arglist while it is
    # being enumerated skips the entry that follows a pattern with no
    # matches.
    filenames = []
    for filespec in arglist:
        if '*' in filespec or '?' in filespec:
            filenames.extend(glob.glob(filespec))
        else:
            filenames.append(filespec)

    if not filenames:
        # Every argument was a pattern and none of them matched a file.
        print('No files match the given file specification')
        return 1

    morecmds = [v for k, v in optitems if k == '-k']
    err = []

    for filename in filenames:
        print('=' * 30)
        print("Checking %s" % (filename,))
        try:
            f = open(filename)
        except IOError:
            print('Cannot open file %s.' % (filename,))
            return 2

        try:
            err.append(checkit(f, opts, morecmds))
        finally:
            f.close()

    return max(err)
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 231 | |
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main())