blob: 245a759bed921fe4d59a23129a77d635e1c722b4 [file] [log] [blame]
Benjamin Peterson9f7ae1b2009-01-09 03:04:01 +00001"""
2Try to detect suspicious constructs, resembling markup
3that has leaked into the final output.
4
5Suspicious lines are reported in a comma-separated-file,
6``suspicious.csv``, located in the output directory.
7
8The file is utf-8 encoded, and each line contains four fields:
9
10 * document name (normalized)
11 * line number in the source document
12 * problematic text
13 * complete line showing the problematic text in context
14
15It is common to find many false positives. To avoid reporting them
16again and again, they may be added to the ``ignored.csv`` file
17(located in the configuration directory). The file has the same
18format as ``suspicious.csv`` with a few differences:
19
20 - each line defines a rule; if the rule matches, the issue
21 is ignored.
22 - line number may be empty (that is, nothing between the
23 commas: ",,"). In this case, line numbers are ignored (the
24 rule matches anywhere in the file).
25 - the last field does not have to be a complete line; some
26 surrounding text (never more than a line) is enough for
27 context.
28
29Rules are processed sequentially. A rule matches when:
30
31 * document names are the same
32 * problematic texts are the same
33 * line numbers are close to each other (5 lines up or down)
34 * the rule text is completely contained into the source line
35
36The simplest way to create the ignored.csv file is by copying
37undesired entries from suspicious.csv (possibly trimming the last
38field.)
39
40Copyright 2009 Gabriel A. Genellina
41
42"""
43
44import os, sys
45import csv
46import re
47from docutils import nodes
Georg Brandlbadbba42009-01-26 23:06:17 +000048
49try:
50 from sphinx.builders import Builder
51except ImportError:
52 from sphinx.builder import Builder
53
Benjamin Peterson9f7ae1b2009-01-09 03:04:01 +000054
# Compiled matcher for markup fragments that commonly leak into rendered
# output.  Bound directly to .finditer, so detect_all(text) yields one
# match object per suspicious fragment found in *text*.
# NOTE: the pattern is pure ASCII, so a plain raw string behaves exactly
# like the former ur'' literal in Python 2, while remaining valid syntax
# under Python 3 (which rejects the ur prefix).
detect_all = re.compile(r'''
    ::(?=[^=])| # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+| # :foo
    `| # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+: # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE).finditer
61
class Rule:
    """A rule for ignoring issues."""

    def __init__(self, docname, lineno, issue, line):
        # Document (normalized name) to which this rule applies.
        self.docname = docname
        # Line number in the original source; the rule only matches
        # near that line.  None means "don't care about line numbers".
        self.lineno = lineno
        # The markup fragment that triggered this rule.
        self.issue = issue
        # Text of the container element (a single line only).
        self.line = line
71
72
class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Writes no real output; instead it appends one utf-8 encoded CSV row
    per suspicious construct to ``suspicious.csv`` in the output
    directory, and sets a non-zero status code when anything is found.
    """
    name = 'suspicious'

    def init(self):
        # Create (or truncate) the output file so each build starts fresh.
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        open(self.log_file_name, 'w').close()
        # Load database of previously ignored issues (known false positives).
        self.load_rules(os.path.join(os.path.dirname(__file__),
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        # Always re-check every document; there is no cacheable output.
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        # This builder produces no documents, so target URIs are irrelevant.
        return ''

    def prepare_writing(self, docnames):
        ### PYTHON PROJECT SPECIFIC ###
        # Skip the "documenting" chapter: it legitimately shows markup
        # fragments that would otherwise all be reported.  Iterate over a
        # copy (set(docnames)) since we mutate docnames while looping.
        for name in set(docnames):
            if name.split('/', 1)[0] == 'documenting':
                docnames.remove(name)
        ### PYTHON PROJECT SPECIFIC ###

    def write_doc(self, docname, doctree):
        # any_issue is set when any issue is encountered in this document.
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* found in *line* unless an ignore rule matches."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname: continue
            if rule.issue != issue: continue
            # Requiring both lines to match *exactly* would be too strict
            # (fuzzy matching with levenshtein distance would need extra
            # libraries), so just check that the rule fragment is
            # contained in the document line.
            if rule.line not in line: continue
            # Line numbers must be "near" each other for the rule to
            # match; lineno=None in the rule means "don't care".
            if (rule.lineno is not None) and \
                abs(rule.lineno - lineno) > 5: continue
            # if it came this far, the rule matched
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Emit a warning and log the issue to suspicious.csv."""
        if not self.any_issue:
            # Emit a newline before the first warning of a document.
            self.info()
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        self.warn('[%s:%d] "%s" found in "%-.120s"' % (
            self.docname.encode(sys.getdefaultencoding(),'replace'),
            lineno,
            issue.encode(sys.getdefaultencoding(),'replace'),
            text.strip().encode(sys.getdefaultencoding(),'replace')))
        # Non-zero status code makes the overall build report failure.
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one utf-8 encoded CSV row to the log file."""
        f = open(self.log_file_name, 'ab')
        try:
            writer = csv.writer(f)
            writer.writerow([self.docname.encode('utf-8'),
                             lineno,
                             issue.encode('utf-8'),
                             text.strip().encode('utf-8')])
        finally:
            # Close the file even if encoding/writing raises.
            f.close()

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text
        """
        self.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        # A missing rules file simply means "no ignore rules".
        try: f = open(filename, 'rb')
        except IOError: return
        try:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                docname = docname.decode('utf-8')
                # An empty line-number field means "match anywhere".
                if lineno: lineno = int(lineno)
                else: lineno = None
                issue = issue.decode('utf-8')
                text = text.decode('utf-8')
                rules.append(Rule(docname, lineno, issue, text))
        finally:
            # Close the file even when a malformed row raises ValueError.
            f.close()
        self.info('done, %d rules loaded' % len(self.rules))
178
179
def get_lineno(node):
    """Obtain line number information for a node.

    Walks up the parent chain (starting at node.parent -- the node's own
    line attribute is never consulted) until an ancestor with a non-None
    line number is found.  Returns None when no ancestor carries one.

    Fixed: the original dereferenced ``node.line`` after ``node.parent``
    became None, raising AttributeError when the whole chain (up to and
    including the document root) had no line number.
    """
    lineno = None
    while lineno is None and node is not None:
        node = node.parent
        # getattr guards against falling off the top of the tree
        # (node becomes None) instead of crashing with AttributeError.
        lineno = getattr(node, 'line', None)
    return lineno
187
188
def extract_line(text, index):
    """Return the single line of *text* containing character *index*.

    *text* may be a multiline string; the result excludes the
    surrounding newline characters.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> extract_line("abc\ndefgh\ni", 0)
    'abc'
    >>> extract_line("abc\ndefgh\ni", 10)
    'i'
    """
    # Start just after the previous newline (rfind returns -1 when there
    # is none, so +1 conveniently yields 0).
    start = text.rfind('\n', 0, index) + 1
    # End at the next newline, or at the end of the string.
    end = text.find('\n', index)
    if end < 0:
        end = len(text)
    return text[start:end]
208
209
class SuspiciousVisitor(nodes.GenericNodeVisitor):
    """Walks a doctree, handing suspicious fragments to the builder."""

    lastlineno = 0  # highest line number seen so far in this document

    def __init__(self, document, builder):
        nodes.GenericNodeVisitor.__init__(self, document)
        self.builder = builder

    def default_visit(self, node):
        # Only direct text containers are scanned.
        if isinstance(node, (nodes.Text, nodes.image)):
            text = node.astext()
            # lineno seems to go backwards sometimes (?), so never let
            # it decrease below what we have already seen.
            lineno = max(get_lineno(node) or 0, self.lastlineno)
            self.lastlineno = lineno
            # Don't report the same issue more than once per line.
            reported = set()
            for match in detect_all(text):
                fragment = match.group()
                context = extract_line(text, match.start())
                if (fragment, context) in reported:
                    continue
                self.builder.check_issue(context, lineno, fragment)
                reported.add((fragment, context))

    unknown_visit = default_visit

    def visit_document(self, node):
        # Line numbers restart with every new document.
        self.lastlineno = 0

    def visit_comment(self, node):
        # ignore comments -- too many false positives.
        # (although doing this could miss some errors;
        # there were two sections "commented-out" by mistake
        # in the Python docs that would not be caught)
        raise nodes.SkipNode