Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 1 | """ TeXcheck.py -- rough syntax checking on Python style LaTeX documents. |
| 2 | |
| 3 | Written by Raymond D. Hettinger <python at rcn.com> |
| 4 | Copyright (c) 2003 Python Software Foundation. All rights reserved. |
| 5 | |
| 6 | Designed to catch common markup errors including: |
 | 7 | * Unbalanced or mismatched parentheses, brackets, and braces. |
 | 8 | * Unbalanced or mismatched \begin and \end blocks. |
| 9 | * Misspelled or invalid LaTeX commands. |
| 10 | * Use of forward slashes instead of backslashes for commands. |
Raymond Hettinger | 4f0c6b2 | 2003-05-10 09:04:37 +0000 | [diff] [blame] | 11 | * Table line size mismatches (only \lineii used in a tableii). |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 12 | |
| 13 | Command line usage: |
 | 14 | python texcheck.py [-h] [-m] [-f] [-d] [-v] [-s lineno] [-k keyword] foobar.tex |
| 15 | |
| 16 | Options: |
Raymond Hettinger | 62aa994 | 2003-05-12 23:33:28 +0000 | [diff] [blame] | 17 | -m: Munge parentheses and brackets so that mixed pairs match. [0,n) would normally mismatch. |
| 18 | -k keyword: Keyword is a valid LaTeX command. Do not include the backslash. |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 19 | -f: Forward-slash warnings suppressed. |
| 20 | -d: Delimiter check only (useful for non-LaTeX files). |
| 21 | -h: Help |
| 22 | -s lineno: Start at lineno (useful for skipping complex sections). |
| 23 | -v: Verbose. Shows current delimiter and unclosed delimiters. |
| 24 | """ |
| 25 | |
Raymond Hettinger | 71e0033 | 2003-05-10 03:30:13 +0000 | [diff] [blame] | 26 | import re |
| 27 | import sets |
| 28 | import sys |
| 29 | import getopt |
| 30 | from itertools import izip, count, islice |
| 31 | |
# Whitespace-separated list of the LaTeX commands that are accepted without
# a warning.  Each entry includes its leading backslash; the string is raw so
# that sequences such as \n, \e and \UNIX are kept literally.  Each command
# is listed once.
cmdstr = r"""
    \section \module \declaremodule \modulesynopsis \moduleauthor
    \sectionauthor \versionadded \code \class \method \begin
    \optional \var \ref \end \subsection \lineiii \hline \label
    \indexii \textrm \ldots \keyword \stindex \index \item \note
    \withsubitem \ttindex \footnote \citetitle \samp \opindex
    \noindent \exception \strong \dfn \ctype \obindex \character
    \indexiii \function \bifuncindex \refmodule \refbimodindex
    \subsubsection \nodename \member \chapter \emph \ASCII \UNIX
    \regexp \program \production \token \productioncont \term
    \grammartoken \lineii \seemodule \file \EOF \documentclass
    \usepackage \title \input \maketitle \ifhtml \fi \url \Cpp
    \tableofcontents \kbd \programopt \envvar \refstmodindex
    \cfunction \constant \NULL \moreargs \cfuncline \cdata
    \textasciicircum \n \ABC \setindexsubitem \versionchanged
    \deprecated \seetext \newcommand \POSIX \pep \warning \rfc
    \verbatiminput \methodline \textgreater \seetitle \lineiv
    \funclineni \ulink \manpage \funcline \dataline \unspecified
    \textbackslash \mimetype \mailheader \seepep \textunderscore
    \longprogramopt \infinity \plusminus \shortversion \version
    \refmodindex \seerfc \makeindex \makemodindex \renewcommand
    \indexname \appendix \protect \indexiv \mbox \textasciitilde
    \platform \seeurl \leftmargin \labelwidth \localmoduletable
    \LaTeX \copyright \memberline \backslash \pi \centerline
    \caption \vspace \textwidth \menuselection \textless
    \makevar \csimplemacro \bfcode \sub \release
    \email \kwindex \refexmodindex \filenq \e
    \exindex \linev \newsgroup \verbatim \setshortversion
"""
| 61 | |
def matchclose(c_lineno, c_symbol, openers, pairmap):
    """Verify that a closing delimiter matches the most recent opener.

    c_lineno -- line number on which the closing delimiter was found
    c_symbol -- the closing delimiter: ')' or ']', or the environment
                name taken from an \\end{name}
    openers  -- stack (list) of (lineno, symbol) pairs for the delimiters
                that are still open; the top entry is popped
    pairmap  -- maps a closing punctuation mark to the opener(s) that may
                legally precede it; a symbol missing from the map (an
                environment name) must match itself

    Returns None when the popped opener is acceptable.  Raises Exception
    with a descriptive message when the stack is empty or when the popped
    opener does not pair with c_symbol.
    """
    try:
        o_lineno, o_symbol = openers.pop()
    except IndexError:
        msg = "Delimiter mismatch. On line %d, encountered closing '%s' without corresponding open" % (c_lineno, c_symbol)
        raise Exception(msg)
    # Environment names are not in pairmap, so they fall back to a
    # one-element list and must equal the opener exactly.
    if o_symbol in pairmap.get(c_symbol, [c_symbol]):
        return
    msg = "Opener '%s' on line %d was not closed before encountering '%s' on line %d" % (o_symbol, o_lineno, c_symbol, c_lineno)
    raise Exception(msg)
| 72 | |
def checkit(source, opts, morecmds=()):
    """Check the LaTeX formatting in a sequence of lines.

    Source is an iterable of text lines (an open file works).

    Opts is a mapping of options to option values if any:
        -m          munge parentheses and brackets
        -f          forward slash warnings to be skipped
        -d          delimiters only checking
        -v          verbose listing on delimiters
        -s lineno:  linenumber to start scan (default is 1).

    Morecmds is a sequence of LaTeX commands (without backslashes) that
    are to be considered valid in the scan.

    Warnings and a closing summary are printed to standard output and the
    function returns 0.  A closing delimiter that does not pair with the
    most recent opener stops the scan with the Exception raised by
    matchclose().
    """

    delims_only = '-d' in opts
    verbose = '-v' in opts
    warn_slashes = not delims_only and '-f' not in opts

    texcmd = re.compile(r'\\[A-Za-z]+')

    validcmds = set(cmdstr.split())
    for cmd in morecmds:
        validcmds.add('\\' + cmd)

    if '-m' in opts:
        pairmap = {']': '[(', ')': '(['}    # Munged openers
    else:
        pairmap = {']': '[', ')': '('}      # Normal opener for a given closer
    openpunct = set('([')                   # Set of valid openers

    delimiters = re.compile(r'\\(begin|end)\{([_a-zA-Z]+)\}|([()\[\]])')
    braces = re.compile(r'[{}]')

    openers = []                            # Stack of pending open delimiters
    bracestack = []                         # Stack of pending open braces

    tablestart = re.compile(r'\\begin\{(?:long)?table([iv]+)\}')
    tableline = re.compile(r'\\line([iv]+)\{')
    tableend = re.compile(r'\\end\{(?:long)?table([iv]+)\}')
    tablelevel = ''
    tablestartline = 0

    # islice() rejects a negative start, so never begin before line 1.
    startline = max(1, int(opts.get('-s', '1')))
    lineno = 0                              # Stays 0 if nothing is scanned

    for lineno, line in enumerate(islice(source, startline - 1, None),
                                  startline):
        line = line.rstrip()

        if warn_slashes and '/' in line:
            # Warn whenever forward slashes encountered
            print('Warning, forward slash on line %d: %s' % (lineno, line))

        if not delims_only:
            # Validate commands.  A name defined by \newcommand{\name}
            # is accepted from this line onward.
            nc = line.find(r'\newcommand')
            if nc != -1:
                start = line.find('{', nc)
                end = line.find('}', start) if start != -1 else -1
                # Only register the name when both braces were found.
                if end != -1:
                    validcmds.add(line[start + 1:end])
            for cmd in texcmd.findall(line):
                if cmd not in validcmds:
                    # cmd already carries its leading backslash.
                    print('Warning, unknown tex cmd on line %d: %s'
                          % (lineno, cmd))

        # Check balancing of open/close parentheses and brackets
        for begend, name, punct in delimiters.findall(line):
            if verbose:
                # Written without a newline so the stack listing below
                # completes the same output line.
                sys.stdout.write('%d | %s %s %s '
                                 % (lineno, begend, name, punct))
            if begend == 'begin' and not delims_only:
                openers.append((lineno, name))
            elif punct in openpunct:
                openers.append((lineno, punct))
            elif begend == 'end' and not delims_only:
                matchclose(lineno, name, openers, pairmap)
            elif punct in pairmap:
                matchclose(lineno, punct, openers, pairmap)
            if verbose:
                print(' -->  %s' % (openers,))

        # Balance opening and closing braces
        for brace in braces.findall(line):
            if brace == '{':
                bracestack.append(lineno)
            else:
                try:
                    bracestack.pop()
                except IndexError:
                    print('Warning, unmatched } on line %s.' % (lineno,))
            if verbose:
                print(' -->  %s' % (bracestack,))

        # Check table levels (make sure lineii only inside tableii)
        m = tablestart.search(line)
        if m:
            tablelevel = m.group(1)
            tablestartline = lineno
        m = tableline.search(line)
        if m and m.group(1) != tablelevel:
            print(r'Warning, \line%s on line %d does not match \table%s on line %d'
                  % (m.group(1), lineno, tablelevel, tablestartline))
        if tableend.search(line):
            tablelevel = ''

    lastline = lineno
    for open_lineno, symbol in openers:
        print("Unmatched open delimiter '%s' on line %d"
              % (symbol, open_lineno))
    for brace_lineno in bracestack:
        print("Unmatched { on line %d" % (brace_lineno,))
    print('Done checking %d lines.' % (lastline,))
    return 0
| 178 | |
def main(args=None):
    """Command line entry point; returns the process exit status.

    Args is the list of command line arguments without the program name;
    when it is None, sys.argv[1:] is used.

    Return values:
        0 -- help was shown, or the check ran to completion
        1 -- usage error (bad option, bad -s value, or no file named)
        2 -- the file could not be opened
    """
    if args is None:
        args = sys.argv[1:]
    try:
        optitems, arglist = getopt.getopt(args, "k:mfdhs:v")
    except getopt.GetoptError as err:
        print('Error: %s' % (err,))
        print('Use -h for help.')
        return 1
    opts = dict(optitems)
    if '-h' in opts or not args:
        print(__doc__)
        return 0

    if '-s' in opts:
        # Reject a bad starting line here rather than failing mid-scan.
        try:
            int(opts['-s'])
        except ValueError:
            print('Option -s requires an integer line number')
            return 1

    if len(arglist) < 1:
        print('Please specify a file to be checked')
        return 1

    # -k may be repeated, so collect from the option list, not the dict.
    morecmds = [v for k, v in optitems if k == '-k']

    try:
        f = open(arglist[0])
    except IOError:
        print('Cannot open file %s.' % arglist[0])
        return 2

    try:
        return checkit(f, opts, morecmds)
    finally:
        f.close()
| 201 | |
if __name__ == '__main__':
    # When run as a script, use main()'s return value as the exit status.
    exit_status = main()
    sys.exit(exit_status)
| 204 | |