blob: e3975602fca7d21cc1286af3b1025357ba8891b1 [file] [log] [blame]
Georg Brandl700cf282009-01-04 10:23:49 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
Suspicious lines are reported in a comma-separated-value file,
``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained into the source line
35
The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
Georg Brandl24710632010-10-06 10:47:20 +000044import os
Georg Brandl700cf282009-01-04 10:23:49 +000045import re
Georg Brandl24710632010-10-06 10:47:20 +000046import csv
47import sys
48
Georg Brandl700cf282009-01-04 10:23:49 +000049from docutils import nodes
Benjamin Peterson1a67f582009-01-08 04:01:00 +000050from sphinx.builders import Builder
Georg Brandl700cf282009-01-04 10:23:49 +000051
# Factory for iterators over suspicious markup fragments: yields one match
# object per construct that looks like reST markup leaked into the rendered
# output.  Note: `ur'...'` is a Python 2 raw-unicode literal.
detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
58
Georg Brandl24710632010-10-06 10:47:20 +000059
class Rule:
    """A rule for ignoring issues.

    A rule matches a reported issue when the document names and the
    problematic fragments are equal, the rule's context line is contained
    in the document line, and the line numbers are close (see the module
    docstring for details).
    """

    def __init__(self, docname, lineno, issue, line):
        self.docname = docname # document to which this rule applies
        self.lineno = lineno   # line number in the original source;
                               # this rule matches only near that.
                               # None -> don't care
        self.issue = issue     # the markup fragment that triggered this rule
        self.line = line       # text of the container element (single line only)
        self.used = False      # set to True the first time the rule matches

    def __repr__(self):
        # same field layout as a suspicious.csv row (empty lineno field)
        return '{0.docname},,{0.issue},{0.line}'.format(self)
Georg Brandl700cf282009-01-04 10:23:49 +000073
74
Georg Brandl24710632010-10-06 10:47:20 +000075
class dialect(csv.excel):
    """CSV dialect for suspicious.csv: identical to Excel's,
    except that rows end with a bare linefeed instead of CRLF."""
    lineterminator = '\n'
79
80
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        """Create (truncate) the output file and load the ignore rules."""
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        # consider every document outdated: the check must run on all of them
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        # this builder produces no output documents, so URIs are irrelevant
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and report any suspicious markup found in it."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched during this run."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.warn('Found %s/%s unused rules:' %
                      (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.info(repr(rule))

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless some ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Log, warn about, and record one suspicious-markup issue."""
        if not self.any_issue:
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(),'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(),'replace'),
            text.strip().encode(sys.getdefaultencoding(),'replace')))
        # non-zero exit status so CI notices the problem
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one utf-8 encoded CSV row to suspicious.csv."""
        # 'with' ensures the file is closed even if writerow() raises
        with open(self.log_file_name, 'ab') as f:
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        # a missing rules file is not an error: there is simply nothing to ignore
        try:
            f = open(filename, 'rb')
        except IOError:
            return
        # 'with' ensures the file is closed even if a malformed row raises
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                docname = docname.decode('utf-8')
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field -> rule matches anywhere in the file
                    lineno = None
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        self.info('done, %d rules loaded' % len(self.rules))
190
191
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the ancestor chain -- starting at the parent, since text
    nodes carry no line information themselves -- until a line number
    is found; returns None when no ancestor has one.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        # stop cleanly at the tree root instead of raising
        # AttributeError on None.line
        if node is None:
            break
        lineno = node.line
    return lineno
199
200
201def extract_line(text, index):
202 """text may be a multiline string; extract
203 only the line containing the given character index.
204
205 >>> extract_line("abc\ndefgh\ni", 6)
206 >>> 'defgh'
207 >>> for i in (0, 2, 3, 4, 10):
208 ... print extract_line("abc\ndefgh\ni", i)
209 abc
210 abc
211 abc
212 defgh
213 defgh
214 i
215 """
216 p = text.rfind('\n', 0, index) + 1
217 q = text.find('\n', index)
Georg Brandl24710632010-10-06 10:47:20 +0000218 if q < 0:
219 q = len(text)
Georg Brandl700cf282009-01-04 10:23:49 +0000220 return text[p:q]
221
222
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Walks a doctree and forwards suspicious markup fragments found in
    text-bearing nodes to the builder's check_issue()."""

    # highest line number seen so far in the current document
    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder  # CheckSuspiciousMarkupBuilder to report to

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        # reset the line counter at the start of each document
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode