blob: 245a759bed921fe4d59a23129a77d635e1c722b4 [file] [log] [blame]
Benjamin Peterson9f7ae1b2009-01-09 03:04:01 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained into the source line
35
36The simplest way to create the ignored.csv file is by copying
37undesired entries from suspicious.csv (possibly trimming the last
38field.)
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
44import os, sys
45import csv
46import re
47from docutils import nodes
Georg Brandlbadbba42009-01-26 23:06:17 +000048
49try:
50 from sphinx.builders import Builder
51except ImportError:
52 from sphinx.builder import Builder
53
Benjamin Peterson9f7ae1b2009-01-09 03:04:01 +000054
# Compiled matcher for markup fragments that commonly leak into rendered
# output.  Bound directly to .finditer, so detect_all(text) yields one
# match object per suspicious fragment found in *text*.
# NOTE: the pattern is pure ASCII, so a plain raw string behaves exactly
# like the former ur'' literal in Python 2, while remaining valid syntax
# under Python 3 (which rejects the ur prefix).
detect_all = re.compile(r'''
    ::(?=[^=])| # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `| # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
61
class Rule:
    """A rule for ignoring issues."""

    def __init__(self, docname, lineno, issue, line):
        # Document (normalized name) to which this rule applies.
        self.docname = docname
        # Line number in the original source; the rule only matches
        # near that line.  None means "don't care about line numbers".
        self.lineno = lineno
        # The markup fragment that triggered this rule.
        self.issue = issue
        # Text of the container element (a single line only).
        self.line = line
71
72
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Writes no real output; instead it appends one utf-8 encoded CSV row
    per suspicious construct to ``suspicious.csv`` in the output
    directory, and sets a non-zero status code when anything is found.
    """
    name = 'suspicious'

    def init(self):
        # Create (or truncate) the output file so each build starts fresh.
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # Load database of previously ignored issues (known false positives).
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        # Always re-check every document; there is no cacheable output.
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        # This builder produces no documents, so target URIs are irrelevant.
        return ''

    def prepare_writing(self, docnames):
        ### PYTHON PROJECT SPECIFIC ###
        # Skip the "documenting" chapter: it legitimately shows markup
        # fragments that would otherwise all be reported.  Iterate over a
        # copy (set(docnames)) since we mutate docnames while looping.
        for name in set(docnames):
            if name.split('/', 1)[0] == 'documenting':
                docnames.remove(name)
        ### PYTHON PROJECT SPECIFIC ###

    def write_doc(self, docname, doctree):
        # any_issue is set when any issue is encountered in this document.
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* found in *line* unless an ignore rule matches."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring both lines to match *exactly* would be too strict
            # (fuzzy matching with levenshtein distance would need extra
            # libraries), so just check that the rule fragment is
            # contained in the document line.
            if rule.line not in line: continue
            # Line numbers must be "near" each other for the rule to
            # match; lineno=None in the rule means "don't care".
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Emit a warning and log the issue to suspicious.csv."""
        if not self.any_issue:
            # Emit a newline before the first warning of a document.
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(),'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(),'replace'),
            text.strip().encode(sys.getdefaultencoding(),'replace')))
        # Non-zero status code makes the overall build report failure.
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one utf-8 encoded CSV row to the log file."""
        f = open(self.log_file_name, 'ab')
        try:
            writer = csv.writer(f)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])
        finally:
            # Close the file even if encoding/writing raises.
            f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        # A missing rules file simply means "no ignore rules".
        try: f = open(filename, 'rb')
        except IOError: return
        try:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                docname = docname.decode('utf-8')
                # An empty line-number field means "match anywhere".
                if lineno: lineno = int(lineno)
                else: lineno = None
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        finally:
            # Close the file even when a malformed row raises ValueError.
            f.close()
        self.info('done, %d rules loaded' % len(self.rules))
178
179
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the parent chain (starting at node.parent -- the node's own
    line attribute is never consulted) until an ancestor with a non-None
    line number is found.  Returns None when no ancestor carries one.

    Fixed: the original dereferenced ``node.line`` after ``node.parent``
    became None, raising AttributeError when the whole chain (up to and
    including the document root) had no line number.
    """
    lineno = None
    while lineno is None and node is not None:
        node = node.parent
        # getattr guards against falling off the top of the tree
        # (node becomes None) instead of crashing with AttributeError.
        lineno = getattr(node, 'line', None)
    return lineno
187
188
def extract_line(text, index):
    """Return the single line of *text* containing character *index*.

    *text* may be a multiline string; the result excludes the
    surrounding newline characters.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> extract_line("abc\ndefgh\ni", 0)
    'abc'
    >>> extract_line("abc\ndefgh\ni", 10)
    'i'
    """
    # Start just after the previous newline (rfind returns -1 when there
    # is none, so +1 conveniently yields 0).
    start = text.rfind('\n', 0, index) + 1
    # End at the next newline, or at the end of the string.
    end = text.find('\n', index)
    if end < 0:
        end = len(text)
    return text[start:end]
208
209
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Walks a doctree, handing suspicious fragments to the builder."""

    lastlineno = 0  # highest line number seen so far in this document

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        # Only direct text containers are scanned.
        if isinstance(node, (nodes.Text, nodes.image)):
            text = node.astext()
            # lineno seems to go backwards sometimes (?), so never let
            # it decrease below what we have already seen.
            lineno = max(get_lineno(node) or 0, self.lastlineno)
            self.lastlineno = lineno
            # Don't report the same issue more than once per line.
            reported = set()
            for match in detect_all(text):
                fragment = match.group()
                context = extract_line(text, match.start())
                if (fragment, context) in reported:
                    continue
                self.builder.check_issue(context, lineno, fragment)
                reported.add((fragment, context))

    unknown_visit = default_visit

    def visit_document(self, node):
        # Line numbers restart with every new document.
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode