blob: 9e814fb94d2b56f596a40055855249b6430bdb5f [file] [log] [blame]
"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

  * document name (normalized)
  * line number in the source document
  * problematic text
  * complete line showing the problematic text in context

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located in the parent of the directory containing this module). The
file has the same format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

  * document names are the same
  * problematic texts are the same
  * line numbers are close to each other (5 lines up or down)
  * the rule text is completely contained in the source line

The simplest way to create the ``susp-ignored.csv`` file is by copying
undesired entries from ``suspicious.csv`` (possibly trimming the last
field).

Copyright 2009 Gabriel A. Genellina

"""
43
Georg Brandl19b3e002010-10-06 10:35:24 +000044import os
Benjamin Peterson28d88b42009-01-09 03:03:23 +000045import re
Georg Brandl19b3e002010-10-06 10:35:24 +000046import csv
47import sys
48
Benjamin Peterson28d88b42009-01-09 03:03:23 +000049from docutils import nodes
50from sphinx.builders import Builder
Pablo Galindoee171a22018-10-15 20:07:23 +010051import sphinx.util
Benjamin Peterson28d88b42009-01-09 03:03:23 +000052
# Pattern describing reST-looking fragments that should never survive into
# rendered text.  Written in verbose mode, so whitespace and "#" comments
# inside the pattern are not part of the expression.
_SUSPICIOUS_MARKUP = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.VERBOSE | re.UNICODE)

# Callable returning an iterator of match objects for every suspicious
# fragment found in the string it is given.
detect_all = _SUSPICIOUS_MARKUP.finditer

# True when running under Python 3; selects the str/bytes handling used
# for CSV files and log messages.
py3 = sys.version_info[0] >= 3
61
Georg Brandl19b3e002010-10-06 10:35:24 +000062
class Rule:
    """One entry of the ignore database: describes an issue to suppress."""

    def __init__(self, docname, lineno, issue, line):
        """Store the four fields of a rule and mark it as not yet used."""
        # Name of the document this rule applies to.
        self.docname = docname
        # Line number in the original source; the rule only matches near
        # that line.  None means the line number is not considered.
        self.lineno = lineno
        # The markup fragment that triggered this rule.
        self.issue = issue
        # Text of the container element (single line only).
        self.line = line
        # Set to True the first time the rule suppresses an issue.
        self.used = False

    def __repr__(self):
        # The line-number field is always rendered empty.
        return '{},,{},{}'.format(self.docname, self.issue, self.line)
Benjamin Peterson28d88b42009-01-09 03:03:23 +000076
77
Georg Brandl19b3e002010-10-06 10:35:24 +000078
class dialect(csv.excel):
    """CSV dialect identical to ``csv.excel`` except that rows end
    with a bare linefeed."""

    lineterminator = '\n'
82
83
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Every document is walked with a SuspiciousVisitor.  Each suspicious
    fragment that is not covered by an ignore rule is appended to
    ``suspicious.csv`` in the output directory, logged as a warning, and
    causes the application status code to be set to 1.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty report file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Report every found document as outdated so all are checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder does not produce link targets."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Scan one document tree for suspicious markup."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched anything."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    # One rule per line; joining with '' would run the
                    # rules together into an unreadable blob.
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule covers it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first one that matches is flagged
        as used and True is returned.  Returns False when none matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # The rule text does not have to equal the whole line: it is
            # enough for the rule fragment to be contained in the
            # document line.
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near" (within 5
            # lines) this rule matches. (lineno=None means "don't care")
            if rule.lineno is not None and abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Record an issue in the CSV report, log it and flag failure."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        else:
            # Python 2: encode explicitly so non-ASCII text cannot make
            # the %-formatting fail.
            self.logger.warning(
                '[%s:%d] "%s" found in "%-.120s"' % (
                    self.docname.encode(sys.getdefaultencoding(), 'replace'),
                    lineno,
                    issue.encode(sys.getdefaultencoding(), 'replace'),
                    text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, stripped text) to the
        CSV report."""
        if py3:
            # The report is documented as UTF-8, so do not depend on the
            # locale encoding; newline='' lets the csv dialect alone
            # decide the line terminator.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        A file that cannot be opened leaves the rule list empty.  A row
        that does not have exactly four fields raises ValueError.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            # No readable ignore file: continue without rules, but finish
            # the status line started above.
            self.logger.info('no ignore file found')
            return
        # "with" guarantees the file is closed even when a malformed row
        # raises ValueError below.
        with f:
            for rownum, row in enumerate(csv.reader(f), 1):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, rownum, row))
                docname, lineno, issue, text = row
                # an empty line-number field means "match anywhere"
                if lineno:
                    lineno = int(lineno)
                else:
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.logger.info('done, %d rules loaded' % len(self.rules))
Benjamin Peterson28d88b42009-01-09 03:03:23 +0000214
215
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the ``parent`` chain, starting with the parent of *node*
    (the node's own ``line`` is not consulted), and returns the first
    ``line`` value that is not None.  Returns None when *node* is false
    or when no ancestor carries a line number.
    """
    while node:
        node = node.parent
        if node is None:
            # Ran past the root of the tree without finding a line
            # number; reading ``node.line`` here would raise
            # AttributeError.
            return None
        lineno = node.line
        if lineno is not None:
            return lineno
    return None
223
224
def extract_line(text, index):
    r"""Return the single line of *text* containing character *index*.

    *text* may be a multiline string.  The returned line does not
    include its newline.  When *index* points at a newline character,
    the line that the newline terminates is returned.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # rfind() gives -1 when there is no earlier newline, so "+ 1" yields
    # 0, the start of the text.
    start = text.rfind('\n', 0, index) + 1
    end = text.find('\n', index)
    if end < 0:
        # last line: no terminating newline
        end = len(text)
    return text[start:end]
245
246
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Tree visitor that hands every suspicious fragment found in text
    and image nodes to the builder's ``check_issue``."""

    # Highest line number seen so far in the current document.
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        # only direct text containers are inspected
        if not isinstance(node, (nodes.Text, nodes.image)):
            return
        text = node.astext()
        # lineno seems to go backwards sometimes (?), so never let the
        # reported line number decrease.
        lineno = max(get_lineno(node) or 0, self.lastlineno)
        self.lastlineno = lineno
        # (issue, line) pairs already handled for this node, so the same
        # issue is not reported more than once per line
        reported = set()
        for found in detect_all(text):
            key = (found.group(), extract_line(text, found.start()))
            if key in reported:
                continue
            issue, line = key
            self.builder.check_issue(line, lineno, issue)
            reported.add(key)

    unknown_visit = default_visit

    def visit_document(self, node):
        # restart line tracking for each document
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode