blob: d3ed849157f041a1febab6b0710e454a84cc4645 [file] [log] [blame]
Georg Brandl700cf282009-01-04 10:23:49 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``susp-ignored.csv`` file
17(located in the parent directory of this module). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained in the source line
35
36The simplest way to create the ignored.csv file is by copying
37undesired entries from suspicious.csv (possibly trimming the last
38field.)
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
Georg Brandl24710632010-10-06 10:47:20 +000044import os
Georg Brandl700cf282009-01-04 10:23:49 +000045import re
Georg Brandl24710632010-10-06 10:47:20 +000046import csv
47import sys
48
Georg Brandl700cf282009-01-04 10:23:49 +000049from docutils import nodes
Benjamin Peterson1a67f582009-01-08 04:01:00 +000050from sphinx.builders import Builder
Georg Brandl700cf282009-01-04 10:23:49 +000051
# Iterator factory over every suspicious markup fragment in a string:
# ``detect_all(text)`` yields one match object per fragment.  The pattern
# is compiled in VERBOSE mode, so the layout and ``#`` comments inside it
# are not part of what is matched.
detect_all = re.compile(r'''
    ::(?=[^=])|             # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+|  # :foo
    `|                      # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:   # .. foo:   (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer

# True when running under Python 3; selects text vs. bytes handling
# for the csv files further down.
py3 = sys.version_info[0] >= 3
60
Georg Brandl24710632010-10-06 10:47:20 +000061
class Rule:
    """One entry of the ignore database: describes an issue to suppress.

    Attributes:
        docname: document to which this rule applies.
        lineno: line number in the original source; the rule only matches
            near that line.  ``None`` means "don't care".
        issue: the markup fragment that triggered this rule.
        line: text of the container element (single line only).
        used: starts as ``False``; set by the builder once the rule has
            matched an issue.
    """

    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.used = False
        self.docname = docname
        self.lineno = lineno
        self.issue = issue
        self.line = line

    def __repr__(self):
        # Rendered in the csv layout of the ignore file; the line-number
        # column is always left empty.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)
Georg Brandl700cf282009-01-04 10:23:49 +000075
76
Georg Brandl24710632010-10-06 10:47:20 +000077
class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline.

    Everything else (delimiter, quoting, ...) is inherited unchanged
    from ``csv.excel``.
    """

    lineterminator = '\n'
81
82
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Every issue that is not covered by an ignore rule is appended to
    ``suspicious.csv`` in the output directory and reported as a warning.
    Both csv files are utf-8 encoded and use a bare linefeed as line
    terminator.
    """
    name = 'suspicious'

    def init(self):
        """Create an empty log file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        with open(self.log_file_name, 'w'):
            pass
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return every found document, so that all of them are checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI: this builder produces no linkable output."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* with a SuspiciousVisitor, reporting its issues."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched during this run."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule covers it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first matching rule is marked as
        used and ``True`` is returned.  ``False`` means no rule matched.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # An exact match of both lines would be too strict: it is
            # enough that the rule fragment is contained in the
            # document line.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
               abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue, emit a warning and flag the build as failed."""
        if not self.any_issue:
            # first issue of this document: emit an empty info line
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        # "%-.120s" truncates the context text to 120 characters
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text))
        else:
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the log file.

        The file is written as utf-8 with "\\n" line endings on both
        Python versions, and the handle is closed even if writing fails.
        """
        if py3:
            # newline='' lets the csv writer control the line terminator;
            # the explicit encoding keeps the output independent of the
            # locale.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        The rules end up in ``self.rules``.  If *filename* cannot be
        opened the list is left empty.  Raises ValueError when a row
        does not have exactly four fields.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                # read as utf-8, matching what the Python 2 branch decodes
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            # no ignore file: carry on without any rule
            return
        # the with-block closes the file even when a malformed row raises
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field: the rule matches at any line
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
210
211
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the ``parent`` chain, starting at the parent of *node*
    (the node's own ``line`` is not consulted), and returns the first
    ``line`` value that is not ``None``.  Returns ``None`` when the top
    of the tree is reached without finding one.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # ran out of ancestors: no line information available
            break
        lineno = node.line
    return lineno
219
220
def extract_line(text, index):
    r"""Return the single line of *text* that holds character *index*.

    *text* may be a multiline string.  The returned line does not
    include its newline characters; an index pointing at a newline
    belongs to the line that the newline terminates.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> [extract_line("abc\ndefgh\ni", i) for i in (0, 2, 3, 4, 10)]
    ['abc', 'abc', 'abc', 'defgh', 'i']
    """
    # first character after the previous newline (0 if there is none)
    start = text.rfind('\n', 0, index) + 1
    # next newline at or after index; none means the last line
    end = text.find('\n', index)
    return text[start:] if end < 0 else text[start:end]
241
242
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Tree visitor that hands suspicious markup fragments to the builder."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text and image nodes; any other node is left alone."""
        # only direct text containers are inspected
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        text = node.astext()
        # lineno seems to go backwards sometimes (?), so never let it
        # drop below the highest value seen so far
        lineno = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = lineno
        # don't report the same issue more than once per line
        reported = set()
        for match in detect_all(text):
            key = (match.group(), extract_line(text, match.start()))
            if key in reported:
                continue
            issue, line = key
            self.builder.check_issue(line, lineno, issue)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        """Reset the line counter for the new document."""
        self.lastlineno = 0

    def visit_comment(self, node):
        """Skip comment nodes entirely."""
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode
274 raise nodes.SkipNode