blob: e3975602fca7d21cc1286af3b1025357ba8891b1 [file] [log] [blame]
Georg Brandl700cf282009-01-04 10:23:49 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
Suspicious lines are reported in a comma-separated-value file,
``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained into the source line
35
The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
Georg Brandl24710632010-10-06 10:47:20 +000044import os
Georg Brandl700cf282009-01-04 10:23:49 +000045import re
Georg Brandl24710632010-10-06 10:47:20 +000046import csv
47import sys
48
Georg Brandl700cf282009-01-04 10:23:49 +000049from docutils import nodes
Benjamin Peterson1a67f582009-01-08 04:01:00 +000050from sphinx.builders import Builder
Georg Brandl700cf282009-01-04 10:23:49 +000051
# Factory for iterators over suspicious markup fragments: yields one match
# object per construct that looks like reST markup leaked into the rendered
# output.  Note: `ur'...'` is a Python 2 raw-unicode literal.
detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
58
Georg Brandl24710632010-10-06 10:47:20 +000059
class Rule:
    """A rule for ignoring issues.

    A rule matches a reported issue when the document names and the
    problematic fragments are equal, the rule's context line is contained
    in the document line, and the line numbers are close (see the module
    docstring for details).
    """

    def __init__(self, docname, lineno, issue, line):
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)
        self.used = False      # set to True the first time the rule matches

    def __repr__(self):
        # same field layout as a suspicious.csv row (empty lineno field)
        return '{0.docname},,{0.issue},{0.line}'.format(self)
Georg Brandl700cf282009-01-04 10:23:49 +000073
74
Georg Brandl24710632010-10-06 10:47:20 +000075
class dialect(csv.excel):
    """CSV dialect for suspicious.csv: identical to Excel's,
    except that rows end with a bare linefeed instead of CRLF."""
    lineterminator = '\n'
79
80
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        """Create (truncate) the output file and load the ignore rules."""
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        # consider every document outdated: the check must run on all of them
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        # this builder produces no output documents, so URIs are irrelevant
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and report any suspicious markup found in it."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched during this run."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless some ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log, warn about, and record one suspicious-markup issue."""
        if not self.any_issue:
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(),'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(),'replace'),
            text.strip().encode(sys.getdefaultencoding(),'replace')))
        # non-zero exit status so CI notices the problem
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one utf-8 encoded CSV row to suspicious.csv."""
        # 'with' ensures the file is closed even if writerow() raises
        with open(self.log_file_name, 'ab') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        # a missing rules file is not an error: there is simply nothing to ignore
        try:
            f = open(filename, 'rb')
        except IOError:
            return
        # 'with' ensures the file is closed even if a malformed row raises
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                docname = docname.decode('utf-8')
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field -> rule matches anywhere in the file
                    lineno = None
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
190
191
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the ancestor chain -- starting at the parent, since text
    nodes carry no line information themselves -- until a line number
    is found; returns None when no ancestor has one.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        # stop cleanly at the tree root instead of raising
        # AttributeError on None.line
        if node is None:
            break
        lineno = node.line
    return lineno
199
200
201def extract_line(text, index):
202 """text may be a multiline string; extract
203 only the line containing the given character index.
204
205 >>> extract_line("abc\ndefgh\ni", 6)
206 >>> 'defgh'
207 >>> for i in (0, 2, 3, 4, 10):
208 ... print extract_line("abc\ndefgh\ni", i)
209 abc
210 abc
211 abc
212 defgh
213 defgh
214 i
215 """
216 p = text.rfind('\n', 0, index) + 1
217 q = text.find('\n', index)
Georg Brandl24710632010-10-06 10:47:20 +0000218 if q < 0:
219 q = len(text)
Georg Brandl700cf282009-01-04 10:23:49 +0000220 return text[p:q]
221
222
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Walks a doctree and forwards suspicious markup fragments found in
    text-bearing nodes to the builder's check_issue()."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder  # CheckSuspiciousMarkupBuilder to report to

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        # reset the line counter at the start of each document
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode