blob: 0a70e57d2b044f09db57913c22c899ebb79197b1 [file] [log] [blame]
Benjamin Peterson28d88b42009-01-09 03:03:23 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located one directory above this extension module). The file has the
same format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line
35
The simplest way to create the ``susp-ignored.csv`` file is by copying
undesired entries from ``suspicious.csv`` (possibly trimming the last
field).
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
Georg Brandl19b3e002010-10-06 10:35:24 +000044import os
Benjamin Peterson28d88b42009-01-09 03:03:23 +000045import re
Georg Brandl19b3e002010-10-06 10:35:24 +000046import csv
47import sys
48
Benjamin Peterson28d88b42009-01-09 03:03:23 +000049from docutils import nodes
50from sphinx.builders import Builder
51
# Compiled pattern for reST-looking fragments that should never survive
# into rendered text.  Verbose mode: whitespace and '#' comments inside
# the pattern are ignored by the regex engine.
_suspicious_markup = re.compile(r'''
    ::(?=[^=])|                 # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+|      # :foo
    `|                          # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:       # .. foo: (but NOT ... else:)
    ''', re.VERBOSE | re.UNICODE)

# detect_all(text) yields one match object per suspicious fragment.
detect_all = _suspicious_markup.finditer

# True when running under Python 3; selects the text/bytes I/O branches.
py3 = sys.version_info[0] >= 3
60
Georg Brandl19b3e002010-10-06 10:35:24 +000061
class Rule:
    """One entry of the ignore database: a known false positive.

    Attributes:
        docname: document to which this rule applies.
        lineno:  line number in the original source; the rule only matches
                 near that line.  None means "don't care".
        issue:   the markup fragment that triggered this rule.
        line:    text of the container element (single line only).
        used:    set to True once the rule has matched an issue.
    """

    def __init__(self, docname, lineno, issue, line):
        self.used = False
        self.docname, self.lineno = docname, lineno
        self.issue, self.line = issue, line

    def __repr__(self):
        # Rendered as a CSV-like row whose line-number field is left empty.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)
Benjamin Peterson28d88b42009-01-09 03:03:23 +000075
76
Georg Brandl19b3e002010-10-06 10:35:24 +000077
class dialect(csv.excel):
    """Excel-style CSV dialect, except that rows end with a bare linefeed."""

    lineterminator = '\n'
81
82
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Every issue that is not matched by an ignore rule is appended to
    ``suspicious.csv`` in the output directory, reported as a warning,
    and sets the application status code to 1.
    """
    name = 'suspicious'

    def init(self):
        """Create an empty log file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return all found documents, so each of them gets checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Always return an empty string."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* with a SuspiciousVisitor, reporting its issues."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched any issue."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* found in *line* unless an ignore rule matches."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order.  A rule matches when its document name
        and issue are equal to the current ones, its text is contained in
        *line*, and (unless the rule has no line number) its line number
        is at most 5 lines away from *lineno*.  The first matching rule
        is marked as used.

        Returns True if a rule matched, False otherwise.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # The rule fragment only has to be contained in the document
            # line; an exact match of both lines proved too strict.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near" this rule
            # matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log the issue, emit a warning and flag the build as failed."""
        if not self.any_issue:
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        # The text is stripped in both branches, matching what is written
        # to the log file.
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text.strip()))
        else:
            encoding = sys.getdefaultencoding()
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(encoding, 'replace'),
                lineno,
                issue.encode(encoding, 'replace'),
                text.strip().encode(encoding, 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the log file."""
        if py3:
            row = [self.docname, lineno, issue, text.strip()]
            # utf-8 as documented for the log file; newline='' leaves the
            # line terminator entirely to the csv dialect.
            f = open(self.log_file_name, 'a', encoding='utf-8', newline='')
        else:
            row = [self.docname.encode('utf-8'),
                   lineno,
                   issue.encode('utf-8'),
                   text.strip().encode('utf-8')]
            f = open(self.log_file_name, 'ab')
        # close the file even if writing the row fails
        with f:
            csv.writer(f, dialect).writerow(row)

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened results in an empty rule list.
        Raises ValueError if a row does not have exactly four fields.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            # best effort: run without rules, but terminate the status
            # line started above
            self.info('skipped (cannot open %s)' % filename)
            return
        # close the file even if a malformed row raises
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # an empty line-number field means "match anywhere"
                if lineno:
                    lineno = int(lineno)
                else:
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
210
211
def get_lineno(node):
    """Obtain line number information for a node.

    The search starts at the node's parent and moves up the tree until
    an ancestor with a non-None ``line`` attribute is found; the node's
    own ``line`` is not consulted.

    Returns that line number, or None when the top of the tree is
    reached without finding one.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # ran past the root: no ancestor carries a line number
            break
        lineno = node.line
    return lineno
219
220
def extract_line(text, index):
    r"""Return the line of *text* that contains character position *index*.

    *text* may be a multiline string; only the line containing the given
    character index is returned, without its newline.  An index that
    points at a newline character belongs to the line that the newline
    terminates.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # start just after the previous newline (0 when there is none)
    p = text.rfind('\n', 0, index) + 1
    # end at the next newline, or at the end of the text
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]
241
242
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Doctree visitor that passes suspicious text fragments to the builder."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        # only direct text containers are scanned
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        content = node.astext()
        # lineno seems to go backwards sometimes (?), so it is never
        # allowed to decrease
        current = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = current
        # (issue, line) pairs already handed to the builder for this node,
        # so the same issue is not reported more than once per line
        reported = set()
        for found in detect_all(content):
            fragment = found.group()
            source_line = extract_line(content, found.start())
            key = (fragment, source_line)
            if key in reported:
                continue
            self.builder.check_issue(source_line, current, fragment)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        # a new document starts: reset the line counter
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode