"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context

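For illustration, a hypothetical ``suspicious.csv`` entry (the document
name, line number and text below are invented) might look like::

    library/os,776,::,The rename() function:: is described below
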
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located in the same directory as this extension). The file has the
same format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ``susp-ignored.csv`` file is by copying
undesired entries from ``suspicious.csv`` (possibly trimming the last
field), as in the example below.

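For example, a hypothetical rule (invented values) that ignores any
``::`` reported anywhere in the document ``library/os``, based only on
a short fragment of the surrounding text, could be::

    library/os,,::,The rename() function::
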
Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
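# Illustrative (invented) snippets and the fragments detect_all finds in
# them -- not an exhaustive description of the pattern above:
#   "see :meth:`str.split`"  -> ":meth" (role-like text) and the backticks
#   "as follows:: some text" -> "::" (not followed by "=")
#   ".. note: missing colon" -> ".. note:" (directive-like text)
#   "... else:"              -> no match (the lookbehind excludes "...")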

py3 = sys.version_info >= (3, 0)


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname   # document to which this rule applies
        self.lineno = lineno     # line number in the original source;
                                 # this rule matches only near that.
                                 # None -> don't care
        self.issue = issue       # the markup fragment that triggered this rule
        self.line = line         # text of the container element (single line only)
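        # Example (invented values): Rule('library/os', 123, '::',
        # 'for example::') ignores a "::" issue reported within 5 lines of
        # line 123 of "library/os", provided the rule's text fragment is
        # contained in the reported line.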


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring the two lines to match exactly would be very strict,
            # and fuzzy matching (e.g. by Levenshtein distance) would pull in
            # extra dependencies; instead, just check that the rule fragment
            # is contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers; if they are "near" each other, this
            # rule matches (lineno=None means "don't care").
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text))
        else:
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        if py3:
            # the CSV log is documented as being utf-8 encoded
            f = open(self.log_file_name, 'a', encoding='utf-8')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])
            f.close()
        else:
            f = open(self.log_file_name, 'ab')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])
            f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                # the rules file is utf-8 encoded, like suspicious.csv
                f = open(filename, 'r', encoding='utf-8')
            else:
                f = open(filename, 'rb')
        except IOError:
            return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            if lineno:
                lineno = int(lineno)
            else:
                lineno = None
            if not py3:
                docname = docname.decode('utf-8')
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    """text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 5, 10):
    ...   print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # Ignore comments -- too many false positives.
        # (Although doing this could miss some errors; there were two
        # sections "commented out" by mistake in the Python docs that
        # would not have been caught.)
        raise nodes.SkipNode
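
# Note: no setup() hook appears in this module; registering
# CheckSuspiciousMarkupBuilder with Sphinx is assumed to happen elsewhere in
# the documentation tool chain.  A minimal, purely illustrative sketch of
# such a hookup (not part of the original module) could look like:
#
#     def setup(app):
#         app.add_builder(CheckSuspiciousMarkupBuilder)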