blob: 34a0112f5a007b7fba899c66e27415e152ab1d3f [file] [log] [blame]
Benjamin Peterson28d88b42009-01-09 03:03:23 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained in the source line
35
36The simplest way to create the ignored.csv file is by copying
37undesired entries from suspicious.csv (possibly trimming the last
38field.)
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
Georg Brandl19b3e002010-10-06 10:35:24 +000044import os
Benjamin Peterson28d88b42009-01-09 03:03:23 +000045import re
Georg Brandl19b3e002010-10-06 10:35:24 +000046import csv
47import sys
48
Benjamin Peterson28d88b42009-01-09 03:03:23 +000049from docutils import nodes
50from sphinx.builders import Builder
Pablo Galindoee171a22018-10-15 20:07:23 +010051import sphinx.util
Benjamin Peterson28d88b42009-01-09 03:03:23 +000052
# Pattern for markup fragments that look like they leaked into the output.
# It is compiled in verbose mode, so the whitespace and the ``#`` notes
# inside the literal are ignored by the regex engine.
_suspicious_markup = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE)

# detect_all(text) yields one match object per suspicious fragment in text.
detect_all = _suspicious_markup.finditer

# True under Python 3; selects the str/bytes handling branches further down.
py3 = sys.version_info[0] >= 3
61
Georg Brandl19b3e002010-10-06 10:35:24 +000062
class Rule:
    """A rule for ignoring issues.

    Attributes:
      docname -- document to which this rule applies
      lineno  -- line number in the original source; the rule matches only
                 near that line (None -> don't care)
      issue   -- the markup fragment that triggered this rule
      line    -- text of the container element (single line only)
      used    -- False on creation; the builder sets it to True when the
                 rule matches an issue
    """

    def __init__(self, docname, lineno, issue, line):
        """Store the four rule fields and mark the rule as not yet used."""
        self.used = False
        self.docname, self.lineno = docname, lineno
        self.issue, self.line = issue, line

    def __repr__(self):
        # Comma-separated form of the rule; the line-number field is
        # left empty.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)
Benjamin Peterson28d88b42009-01-09 03:03:23 +000076
77
Georg Brandl19b3e002010-10-06 10:35:24 +000078
class dialect(csv.excel):
    """CSV dialect for the report files: ``csv.excel`` with linefeed-only rows.

    The line terminator is the only setting overridden here.
    """
    lineterminator = "\n"
82
83
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Every issue found is appended to ``suspicious.csv`` in the output
    directory and logged as a warning, unless one of the rules loaded
    from ``susp-ignored.csv`` matches it.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty report file and load the ignore rules."""
        # create output file (an existing report is truncated)
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        with open(self.log_file_name, 'w'):
            pass
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return all found documents, so every one of them is checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI for every document."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* with a SuspiciousVisitor, reporting what it finds."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Log the ignore rules that never matched any issue."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            # ``warning`` rather than the deprecated ``warn`` alias
            self.logger.warning('Found %s/%s unused rules:' %
                                (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.logger.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first one that matches is flagged
        as used and True is returned.  Returns False when none matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # The lines are not required to match exactly: it is enough
            # that the rule fragment is contained in the document line.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Record the issue in the report file, log it and flag failure."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        else:
            # Python 2: encode the unicode values before formatting
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        # make the build exit with a non-zero status
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the report file."""
        if py3:
            # The report is documented as utf-8, so do not depend on the
            # locale encoding.  newline='' lets the csv writer emit the
            # dialect's bare linefeed untranslated.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        If the file cannot be opened, the rule list is left empty.
        Raises ValueError when a row does not have exactly four fields.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError as err:
            # Still best-effort, but finish the status line started above.
            self.logger.info('skipped (%s)' % err)
            return
        # the file is closed even if a malformed row raises below
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field: the rule applies to any line
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rule = Rule(docname, lineno, issue, text)
                rules.append(rule)
        self.logger.info('done, %d rules loaded' % len(self.rules))
Benjamin Peterson28d88b42009-01-09 03:03:23 +0000211
212
def get_lineno(node):
    """Obtain line number information for a node.

    The node itself is not inspected: its ancestors are walked upwards
    and the ``line`` of the nearest one that has it set is returned.
    Returns None when no ancestor carries a line number.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # went past the root without finding a line number
            break
        lineno = node.line
    return lineno
220
221
def extract_line(text, index):
    r"""Return the single line of *text* containing character *index*.

    *text* may be a multiline string.  The newline characters around the
    selected line are not part of the result.  An index that points at a
    newline selects the line which that newline terminates.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # first character after the previous newline (0 when there is none)
    start = text.rfind('\n', 0, index) + 1
    stop = text.find('\n', index)
    if stop == -1:
        # no newline follows: the line runs to the end of the text
        return text[start:]
    return text[start:stop]
242
243
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Doctree visitor that passes suspicious fragments to the builder."""

    # highest line number seen so far; reset when a document node is visited
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        """Scan text and image nodes; other node types are passed over."""
        # only direct text containers are examined
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        content = node.astext()
        # lineno seems to go backwards sometimes (?), so never let it drop
        # below the last value used
        current = get_lineno(node) or 0
        if current < self.lastlineno:
            current = self.lastlineno
        self.lastlineno = current
        # don't report the same issue more than once per line
        reported = set()
        for found in detect_all(content):
            fragment = found.group()
            context = extract_line(content, found.start())
            key = (fragment, context)
            if key in reported:
                continue
            self.builder.check_issue(context, current, fragment)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        """Restart line tracking."""
        self.lastlineno = 0

    def visit_comment(self, node):
        """Skip comment nodes."""
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode