"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated file,
``suspicious.csv``, located in the output directory.

The file is UTF-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context
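
For example, an entry might look like this (the values are illustrative):

 library/example,42,:meth,the :meth role leaked into this line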

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
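
For instance, a rule like the following (an illustrative entry, not a
real one) ignores every occurrence of the fragment "::" in the document
library/example, regardless of line number:

 library/example,,::,usage is shown by example::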

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
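# For example (illustrative), scanning the text u"see :meth:`foo` here"
# with the pattern above yields the fragments u':meth', u'`' and u'`'.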


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                #   this rule matches only near that.
                                #   None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring an exact match of the whole line would be rather
            # strict, and fuzzy matching (e.g. Levenshtein distance) would
            # pull in extra libraries, so just check whether the rule
            # fragment is contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers.  If they are near each other this
            # rule matches.  (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(), 'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(), 'replace'),
            text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        f = open(self.log_file_name, 'ab')
        writer = csv.writer(f, dialect)
        writer.writerow([self.docname.encode('utf-8'),
                         lineno,
                         issue.encode('utf-8'),
                         text.strip().encode('utf-8')])
        f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A CSV file with exactly the same format as suspicious.csv.
        Fields: document name (normalized), line number, issue,
        surrounding text.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try: f = open(filename, 'rb')
        except IOError: return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            docname = docname.decode('utf-8')
            if lineno: lineno = int(lineno)
            else: lineno = None
            issue = issue.decode('utf-8')
            text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...   print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode