"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated file,
``suspicious.csv``, located in the output directory.

The file is utf-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context

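For illustration, a hypothetical ``suspicious.csv`` entry (the document
name, line number and text below are invented) might look like::

    library/os,776,::,The rename() function:: is described below
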
It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``susp-ignored.csv`` file
(located in the same directory as this extension). The file has the
same format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ``susp-ignored.csv`` file is by copying
undesired entries from ``suspicious.csv`` (possibly trimming the last
field), as in the example below.

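For example, a hypothetical rule (invented values) that ignores any
``::`` reported anywhere in the document ``library/os``, based only on
a short fragment of the surrounding text, could be::

    library/os,,::,The rename() function::
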
Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(r'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
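# Illustrative (invented) snippets and the fragments detect_all finds in
# them -- not an exhaustive description of the pattern above:
#   "see :meth:`str.split`"  -> ":meth" (role-like text) and the backticks
#   "as follows:: some text" -> "::" (not followed by "=")
#   ".. note: missing colon" -> ".. note:" (directive-like text)
#   "... else:"              -> no match (the lookbehind excludes "...")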

py3 = sys.version_info >= (3, 0)


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname   # document to which this rule applies
        self.lineno = lineno     # line number in the original source;
                                 # this rule matches only near that.
                                 # None -> don't care
        self.issue = issue       # the markup fragment that triggered this rule
        self.line = line         # text of the container element (single line only)
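        # Example (invented values): Rule('library/os', 123, '::',
        # 'for example::') ignores a "::" issue reported within 5 lines of
        # line 123 of "library/os", provided the rule's text fragment is
        # contained in the reported line.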


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring the two lines to match exactly would be very strict,
            # and fuzzy matching (e.g. by Levenshtein distance) would pull in
            # extra dependencies; instead, just check that the rule fragment
            # is contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers; if they are "near" each other, this
            # rule matches (lineno=None means "don't care").
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.warn('[%s:%d] "%s" found in "%-.120s"' %
                      (self.docname, lineno, issue, text))
        else:
            self.warn('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        if py3:
            # the CSV log is documented as being utf-8 encoded
            f = open(self.log_file_name, 'a', encoding='utf-8')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname, lineno, issue, text.strip()])
            f.close()
        else:
            f = open(self.log_file_name, 'ab')
            writer = csv.writer(f, dialect)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])
            f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                # the rules file is utf-8 encoded, like suspicious.csv
                f = open(filename, 'r', encoding='utf-8')
            else:
                f = open(filename, 'rb')
        except IOError:
            return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            if lineno:
                lineno = int(lineno)
            else:
                lineno = None
            if not py3:
                docname = docname.decode('utf-8')
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    """text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 5, 10):
    ...   print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)):  # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set()  # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # Ignore comments -- too many false positives.
        # (Although doing this could miss some errors; there were two
        # sections "commented out" by mistake in the Python docs that
        # would not have been caught.)
        raise nodes.SkipNode
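
# Note: no setup() hook appears in this module; registering
# CheckSuspiciousMarkupBuilder with Sphinx is assumed to happen elsewhere in
# the documentation tool chain.  A minimal, purely illustrative sketch of
# such a hookup (not part of the original module) could look like:
#
#     def setup(app):
#         app.add_builder(CheckSuspiciousMarkupBuilder)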