Blame - Doc/tools/sphinxext/suspicious.py - platform/external/python/cpython3

blob: ae11793ea54830b805dead5dabb8d1fc03fe35d9 [file] [log] [blame]

Benjamin Peterson	28d88b4	2009-01-09 03:03:23 +0000	[diff] [blame]	1	"""
				2	Try to detect suspicious constructs, resembling markup
				3	that has leaked into the final output.
				4
				5	Suspicious lines are reported in a comma-separated-file,
				6	``suspicious.csv``, located in the output directory.
				7
				8	The file is utf-8 encoded, and each line contains four fields:
				9
				10	* document name (normalized)
				11	* line number in the source document
				12	* problematic text
				13	* complete line showing the problematic text in context
				14
				15	It is common to find many false positives. To avoid reporting them
				16	again and again, they may be added to the ``susp-ignored.csv`` file
				17	(located in the same directory as this extension). The file has the same
				18	format as ``suspicious.csv`` with a few differences:
				19
				20	- each line defines a rule; if the rule matches, the issue
				21	is ignored.
				22	- line number may be empty (that is, nothing between the
				23	commas: ",,"). In this case, line numbers are ignored (the
				24	rule matches anywhere in the file).
				25	- the last field does not have to be a complete line; some
				26	surrounding text (never more than a line) is enough for
				27	context.
				28
				29	Rules are processed sequentially. A rule matches when:
				30
				31	* document names are the same
				32	* problematic texts are the same
				33	* line numbers are close to each other (5 lines up or down)
				34	* the rule text is completely contained in the source line
				35
				36	The simplest way to create the ``susp-ignored.csv`` file is by copying
				37	undesired entries from ``suspicious.csv`` (possibly trimming the last
				38	field).
				39
				40	Copyright 2009 Gabriel A. Genellina
				41
				42	"""
				43
				44	import os, sys
				45	import csv
				46	import re
				47	from docutils import nodes
				48	from sphinx.builders import Builder
				49
				50	detect_all = re.compile(ur'''
				51	::(?=[^=])\| # two :: (but NOT ::=)
				52	:[a-zA-Z][a-zA-Z0-9]+\| # :foo
				53	`\| # ` (seldom used by itself)
				54	(?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:)
				55	''', re.UNICODE \| re.VERBOSE).finditer
				56
				57	class Rule:
				58	def __init__(self, docname, lineno, issue, line):
				59	"A rule for ignoring issues"
				60	self.docname = docname # document to which this rule applies
				61	self.lineno = lineno # line number in the original source;
				62	# this rule matches only near that.
				63	# None -> don't care
				64	self.issue = issue # the markup fragment that triggered this rule
				65	self.line = line # text of the container element (single line only)
				66
				67
				68	class CheckSuspiciousMarkupBuilder(Builder):
				69	"""
				70	Checks for possibly invalid markup that may leak into the output
				71	"""
				72	name = 'suspicious'
				73
				74	def init(self):
				75	# create output file
				76	self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
				77	open(self.log_file_name, 'w').close()
				78	# load database of previously ignored issues
				79	self.load_rules(os.path.join(os.path.dirname(__file__), 'susp-ignored.csv'))
				80
				81	def get_outdated_docs(self):
				82	return self.env.found_docs
				83
				84	def get_target_uri(self, docname, typ=None):
				85	return ''
				86
				87	def prepare_writing(self, docnames):
				88	### PYTHON PROJECT SPECIFIC ###
				89	for name in set(docnames):
				90	if name.split('/', 1)[0] == 'documenting':
				91	docnames.remove(name)
				92	### PYTHON PROJECT SPECIFIC ###
				93
				94	def write_doc(self, docname, doctree):
				95	self.any_issue = False # set when any issue is encountered in this document
				96	self.docname = docname
				97	visitor = SuspiciousVisitor(doctree, self)
				98	doctree.walk(visitor)
				99
				100	def finish(self):
				101	return
				102
				103	def check_issue(self, line, lineno, issue):
				104	if not self.is_ignored(line, lineno, issue):
				105	self.report_issue(line, lineno, issue)
				106
				107	def is_ignored(self, line, lineno, issue):
				108	"""Determine whether this issue should be ignored.
				109	"""
				110	docname = self.docname
				111	for rule in self.rules:
				112	if rule.docname != docname: continue
				113	if rule.issue != issue: continue
				114	# Both lines must match exactly. This is rather strict,
				115	# and probably should be improved.
				116	# Doing fuzzy matches with levenshtein distance could work,
				117	# but that means bringing other libraries...
				118	# Ok, relax that requirement: just check if the rule fragment
				119	# is contained in the document line
				120	if rule.line not in line: continue
				121	# Check both line numbers. If they're "near"
				122	# this rule matches. (lineno=None means "don't care")
				123	if (rule.lineno is not None) and \
				124	abs(rule.lineno - lineno) > 5: continue
				125	# if it came this far, the rule matched
				126	return True
				127	return False
				128
				129	def report_issue(self, text, lineno, issue):
				130	if not self.any_issue: self.info()
				131	self.any_issue = True
				132	self.write_log_entry(lineno, issue, text)
				133	self.warn('[%s:%d] "%s" found in "%-.120s"' % (
				134	self.docname.encode(sys.getdefaultencoding(),'replace'),
				135	lineno,
				136	issue.encode(sys.getdefaultencoding(),'replace'),
				137	text.strip().encode(sys.getdefaultencoding(),'replace')))
				138	self.app.statuscode = 1
				139
				140	def write_log_entry(self, lineno, issue, text):
				141	f = open(self.log_file_name, 'ab')
				142	writer = csv.writer(f)
				143	writer.writerow([self.docname.encode('utf-8'),
				144	lineno,
				145	issue.encode('utf-8'),
				146	text.strip().encode('utf-8')])
				147	del writer
				148	f.close()
				149
				150	def load_rules(self, filename):
				151	"""Load database of previously ignored issues.
				152
				153	A csv file, with exactly the same format as suspicious.csv
				154	Fields: document name (normalized), line number, issue, surrounding text
				155	"""
				156	self.info("loading ignore rules... ", nonl=1)
				157	self.rules = rules = []
				158	try: f = open(filename, 'rb')
				159	except IOError: return
				160	for i, row in enumerate(csv.reader(f)):
				161	if len(row) != 4:
				162	raise ValueError, "wrong format in %s, line %d: %s" % (filename, i+1, row)
				163	docname, lineno, issue, text = row
				164	docname = docname.decode('utf-8')
				165	if lineno: lineno = int(lineno)
				166	else: lineno = None
				167	issue = issue.decode('utf-8')
				168	text = text.decode('utf-8')
				169	rule = Rule(docname, lineno, issue, text)
				170	rules.append(rule)
				171	f.close()
				172	self.info('done, %d rules loaded' % len(self.rules))
				173
				174
				175	def get_lineno(node):
				176	"Obtain line number information for a node"
				177	lineno = None
				178	while lineno is None and node:
				179	node = node.parent
				180	lineno = node.line
				181	return lineno
				182
				183
				184	def extract_line(text, index):
				185	"""text may be a multiline string; extract
				186	only the line containing the given character index.
				187
				188	>>> extract_line("abc\ndefgh\ni", 6)
				189	>>> 'defgh'
				190	>>> for i in (0, 2, 3, 4, 10):
				191	... print extract_line("abc\ndefgh\ni", i)
				192	abc
				193	abc
				194	abc
				195	defgh
				196	defgh
				197	i
				198	"""
				199	p = text.rfind('\n', 0, index) + 1
				200	q = text.find('\n', index)
				201	if q<0: q = len(text)
				202	return text[p:q]
				203
				204
				205	class SuspiciousVisitor(nodes.GenericNodeVisitor):
				206
				207	lastlineno = 0
				208
				209	def __init__(self, document, builder):
				210	nodes.GenericNodeVisitor.__init__(self, document)
				211	self.builder = builder
				212
				213	def default_visit(self, node):
				214	if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
				215	text = node.astext()
				216	# lineno seems to go backwards sometimes (?)
				217	self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
				218	seen = set() # don't report the same issue more than only once per line
				219	for match in detect_all(text):
				220	#import pdb; pdb.set_trace()
				221	issue = match.group()
				222	line = extract_line(text, match.start())
				223	if (issue, line) not in seen:
				224	self.builder.check_issue(line, lineno, issue)
				225	seen.add((issue, line))
				226
				227	unknown_visit = default_visit
				228
				229	def visit_document(self, node):
				230	self.lastlineno = 0
				231
				232	def visit_comment(self, node):
				233	# ignore comments -- too much false positives.
				234	# (although doing this could miss some errors;
				235	# there were two sections "commented-out" by mistake
				236	# in the Python docs that would not be catched)
				237	raise nodes.SkipNode