"""
Try to detect suspicious constructs, resembling markup
that has leaked into the final output.

Suspicious lines are reported in a comma-separated file,
``suspicious.csv``, located in the output directory.

The file is UTF-8 encoded, and each line contains four fields:

 * document name (normalized)
 * line number in the source document
 * problematic text
 * complete line showing the problematic text in context
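
For example, an entry might look like this (the values are illustrative):

 library/example,42,:meth,the :meth role leaked into this line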

It is common to find many false positives. To avoid reporting them
again and again, they may be added to the ``ignored.csv`` file
(located in the configuration directory). The file has the same
format as ``suspicious.csv`` with a few differences:

  - each line defines a rule; if the rule matches, the issue
    is ignored.
  - line number may be empty (that is, nothing between the
    commas: ",,"). In this case, line numbers are ignored (the
    rule matches anywhere in the file).
  - the last field does not have to be a complete line; some
    surrounding text (never more than a line) is enough for
    context.

Rules are processed sequentially. A rule matches when:

 * document names are the same
 * problematic texts are the same
 * line numbers are close to each other (5 lines up or down)
 * the rule text is completely contained in the source line

The simplest way to create the ignored.csv file is by copying
undesired entries from suspicious.csv (possibly trimming the last
field).
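
For instance, a rule like the following (an illustrative entry, not a
real one) ignores every occurrence of the fragment "::" in the document
library/example, regardless of line number:

 library/example,,::,usage is shown by example::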

Copyright 2009 Gabriel A. Genellina

"""

import os
import re
import csv
import sys

from docutils import nodes
from sphinx.builders import Builder

detect_all = re.compile(ur'''
    ::(?=[^=])|            # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `|                     # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:  # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
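# For example (illustrative), scanning the text u"see :meth:`foo` here"
# with the pattern above yields the fragments u':meth', u'`' and u'`'.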


class Rule:
    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        self.docname = docname  # document to which this rule applies
        self.lineno = lineno    # line number in the original source;
                                #   this rule matches only near that.
                                #   None -> don't care
        self.issue = issue      # the markup fragment that triggered this rule
        self.line = line        # text of the container element (single line only)


class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""
    lineterminator = '\n'


class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.
    """
    name = 'suspicious'

    def init(self):
        # create output file
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        return ''

    def prepare_writing(self, docnames):
        pass

    def write_doc(self, docname, doctree):
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored."""
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring an exact match of the whole line would be rather
            # strict, and fuzzy matching (e.g. Levenshtein distance) would
            # pull in extra libraries, so just check whether the rule
            # fragment is contained in the document line.
            if rule.line not in line: continue
            # Check both line numbers.  If they are near each other this
            # rule matches.  (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        if not self.any_issue: self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(), 'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(), 'replace'),
            text.strip().encode(sys.getdefaultencoding(), 'replace')))
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        f = open(self.log_file_name, 'ab')
        writer = csv.writer(f, dialect)
        writer.writerow([self.docname.encode('utf-8'),
                         lineno,
                         issue.encode('utf-8'),
                         text.strip().encode('utf-8')])
        f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A CSV file with exactly the same format as suspicious.csv.
        Fields: document name (normalized), line number, issue,
        surrounding text.
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try: f = open(filename, 'rb')
        except IOError: return
        for i, row in enumerate(csv.reader(f)):
            if len(row) != 4:
                raise ValueError(
                    "wrong format in %s, line %d: %s" % (filename, i+1, row))
            docname, lineno, issue, text = row
            docname = docname.decode('utf-8')
            if lineno: lineno = int(lineno)
            else: lineno = None
            issue = issue.decode('utf-8')
            text = text.decode('utf-8')
            rule = Rule(docname, lineno, issue, text)
            rules.append(rule)
        f.close()
        self.info('done, %d rules loaded' % len(self.rules))


def get_lineno(node):
    """Obtain line number information for a node."""
    lineno = None
    while lineno is None and node:
        node = node.parent
        lineno = node.line
    return lineno


def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...   print extract_line("abc\ndefgh\ni", i)
    abc
    abc
    abc
    defgh
    i
    """
    p = text.rfind('\n', 0, index) + 1
    q = text.find('\n', index)
    if q < 0:
        q = len(text)
    return text[p:q]


class SuspiciousVisitor(nodes.GenericNodeVisitor):

    lastlineno = 0

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        if isinstance(node, (nodes.Text, nodes.image)): # direct text containers
            text = node.astext()
            # lineno seems to go backwards sometimes (?)
            self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)
            seen = set() # don't report the same issue more than once per line
            for match in detect_all(text):
                issue = match.group()
                line = extract_line(text, match.start())
                if (issue, line) not in seen:
                    self.builder.check_issue(line, lineno, issue)
                    seen.add((issue, line))

    unknown_visit = default_visit

    def visit_document(self, node):
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode