Blame - Doc/tools/extensions/suspicious.py - platform/external/python/cpython3

blob: 9e814fb94d2b56f596a40055855249b6430bdb5f [file] [log] [blame]

Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	1	"""
				2	Try to detect suspicious constructs, resembling markup
				3	that has leaked into the final output.
				4
				5	Suspicious lines are reported in a comma-separated-file,
				6	``suspicious.csv``, located in the output directory.
				7
				8	The file is utf-8 encoded, and each line contains four fields:
				9
				10	* document name (normalized)
				11	* line number in the source document
				12	* problematic text
				13	* complete line showing the problematic text in context
				14
				15	It is common to find many false positives. To avoid reporting them
				16	again and again, they may be added to the ``ignored.csv`` file
				17	(located in the configuration directory). The file has the same
				18	format as ``suspicious.csv`` with a few differences:
				19
				20	- each line defines a rule; if the rule matches, the issue
				21	is ignored.
				22	- line number may be empty (that is, nothing between the
				23	commas: ",,"). In this case, line numbers are ignored (the
				24	rule matches anywhere in the file).
				25	- the last field does not have to be a complete line; some
				26	surrounding text (never more than a line) is enough for
				27	context.
				28
				29	Rules are processed sequentially. A rule matches when:
				30
				31	* document names are the same
				32	* problematic texts are the same
				33	* line numbers are close to each other (5 lines up or down)
				34	* the rule text is completely contained in the source line
				35
				36	The simplest way to create the ignored.csv file is by copying
				37	undesired entries from suspicious.csv (possibly trimming the last
				38	field.)
				39
				40	Copyright 2009 Gabriel A. Genellina
				41
				42	"""
				43
Georg Brandl	19b3e00	2010-10-06 10:35:24 +0000	[diff] [blame]	44	import os
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	45	import re
Georg Brandl	19b3e00	2010-10-06 10:35:24 +0000	[diff] [blame]	46	import csv
				47	import sys
				48
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	49	from docutils import nodes
				50	from sphinx.builders import Builder
Pablo Galindo	ee171a2	2018-10-15 20:07:23 +0100	[diff] [blame]	51	import sphinx.util
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	52
# Iterator factory over every suspicious markup fragment found in a string.
# Each alternative below is one kind of reST construct that should not
# survive into rendered output.
detect_all = re.compile(r'''
    ::(?=[^=])|                 # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+|      # :foo
    `|                          # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:       # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
				59
# True when running under Python 3 or later; selects text- vs. byte-oriented
# file handling further down.
py3 = sys.version_info[0] >= 3
				61
Georg Brandl	19b3e00	2010-10-06 10:35:24 +0000	[diff] [blame]	62
class Rule:
    """A rule for ignoring issues."""

    def __init__(self, docname, lineno, issue, line):
        # document to which this rule applies
        self.docname = docname
        # line number in the original source; this rule matches only
        # near that.  None -> don't care
        self.lineno = lineno
        # the markup fragment that triggered this rule
        self.issue = issue
        # text of the container element (single line only)
        self.line = line
        # flipped to True once the rule has matched at least one issue
        self.used = False

    def __repr__(self):
        # Rendered like a rule row with the line-number field left empty.
        return '%s,,%s,%s' % (self.docname, self.issue, self.line)
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	76
				77
Georg Brandl	19b3e00	2010-10-06 10:35:24 +0000	[diff] [blame]	78
				79	class dialect(csv.excel):
				80	"""Our dialect: uses only linefeed as newline."""
				81	lineterminator = '\n'
				82
				83
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Each suspicious fragment that is not covered by an ignore rule is
    appended to ``suspicious.csv`` in the output directory, logged as a
    warning, and sets the application's status code to 1.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty log file and load the ignore rules."""
        # create (or truncate) the output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Report every found document as outdated so all get checked."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder produces no linkable output."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* and check every text node of *docname*."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Warn about ignore rules that never matched any issue."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            self.logger.warning(
                'Found %s/%s unused rules: %s' % (
                    len(unused_rules), len(self.rules),
                    # one rule per line; joining with '' would run the
                    # rules together into a single unreadable string
                    '\n'.join(repr(rule) for rule in unused_rules),
                )
            )
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* found in *line* unless an ignore rule covers it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Returns True (and marks the matching rule as used) for the first
        rule whose document name and issue are equal, whose text is
        contained in *line*, and whose line number, if given, is within
        5 lines of *lineno*.  Returns False when no rule matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Both lines must match exactly. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Record the issue in the log file, warn, and flag the build."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        else:
            self.logger.warning(
                '[%s:%d] "%s" found in "%-.120s"' % (
                    self.docname.encode(sys.getdefaultencoding(), 'replace'),
                    lineno,
                    issue.encode(sys.getdefaultencoding(), 'replace'),
                    text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the log file."""
        if py3:
            # utf-8 as promised by the module docstring; newline='' so the
            # csv module alone decides the row terminator
            f = open(self.log_file_name, 'a', encoding='utf-8', newline='')
            row = [self.docname, lineno, issue, text.strip()]
        else:
            f = open(self.log_file_name, 'ab')
            row = [self.docname.encode('utf-8'),
                   lineno,
                   issue.encode('utf-8'),
                   text.strip().encode('utf-8')]
        # the context manager closes the file even if writing fails
        with f:
            csv.writer(f, dialect).writerow(row)

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        If the file cannot be opened, ``self.rules`` is left empty.
        Raises ValueError when a row does not have exactly four fields.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            # an unreadable or missing rules file means "no rules"
            return
        # the context manager closes the file even when a row is malformed
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                if lineno:
                    lineno = int(lineno)
                else:
                    # empty field: the rule applies anywhere in the document
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rule = Rule(docname, lineno, issue, text)
                rules.append(rule)
        self.logger.info('done, %d rules loaded' % len(self.rules))
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	214
				215
				216	def get_lineno(node):
Georg Brandl	19b3e00	2010-10-06 10:35:24 +0000	[diff] [blame]	217	"""Obtain line number information for a node."""
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	218	lineno = None
				219	while lineno is None and node:
				220	node = node.parent
				221	lineno = node.line
				222	return lineno
				223
				224
				225	def extract_line(text, index):
				226	"""text may be a multiline string; extract
				227	only the line containing the given character index.
				228
				229	>>> extract_line("abc\ndefgh\ni", 6)
				230	>>> 'defgh'
				231	>>> for i in (0, 2, 3, 4, 10):
				232	... print extract_line("abc\ndefgh\ni", i)
				233	abc
				234	abc
				235	abc
				236	defgh
				237	defgh
				238	i
				239	"""
				240	p = text.rfind('\n', 0, index) + 1
				241	q = text.find('\n', index)
Georg Brandl	19b3e00	2010-10-06 10:35:24 +0000	[diff] [blame]	242	if q < 0:
				243	q = len(text)
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	244	return text[p:q]
				245
				246
				247	class SuspiciousVisitor(nodes.GenericNodeVisitor):
				248
				249	lastlineno = 0
				250
				251	def __init__(self, document, builder):
				252	nodes.GenericNodeVisitor.__init__(self, document)
				253	self.builder = builder
				254
				255	def default_visit(self, node):
				256	if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
				257	text = node.astext()
				258	# lineno seems to go backwards sometimes (?)
				259	self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
				260	seen = set() # don't report the same issue more than only once per line
				261	for match in detect_all(text):
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	262	issue = match.group()
				263	line = extract_line(text, match.start())
				264	if (issue, line) not in seen:
				265	self.builder.check_issue(line, lineno, issue)
				266	seen.add((issue, line))
				267
				268	unknown_visit = default_visit
				269
				270	def visit_document(self, node):
				271	self.lastlineno = 0
				272
				273	def visit_comment(self, node):
				274	# ignore comments -- too much false positives.
				275	# (although doing this could miss some errors;
				276	# there were two sections "commented-out" by mistake
Martin Panter	2275e62	2016-06-20 07:52:50 +0000	[diff] [blame]	277	# in the Python docs that would not be caught)
Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	278	raise nodes.SkipNode