Blame - Doc/tools/extensions/suspicious.py - platform/external/python/cpython3

2009-01-09 03:03:23 +0000

[diff] [blame]

1

"""

2

Try to detect suspicious constructs, resembling markup

3

that has leaked into the final output.

4

5

Suspicious lines are reported in a comma-separated-file,

6

``suspicious.csv``, located in the output directory.

7

8

The file is utf-8 encoded, and each line contains four fields:

9

10

* document name (normalized)

11

* line number in the source document

12

* problematic text

13

* complete line showing the problematic text in context

14

15

It is common to find many false positives. To avoid reporting them

16

again and again, they may be added to the ``susp-ignored.csv`` file

17

(located in the configuration directory). The file has the same

18

format as ``suspicious.csv`` with a few differences:

19

20

- each line defines a rule; if the rule matches, the issue

21

is ignored.

22

- line number may be empty (that is, nothing between the

23

commas: ",,"). In this case, line numbers are ignored (the

24

rule matches anywhere in the file).

25

- the last field does not have to be a complete line; some

26

surrounding text (never more than a line) is enough for

27

context.

28

29

Rules are processed sequentially. A rule matches when:

30

31

* document names are the same

32

* problematic texts are the same

33

* line numbers are close to each other (5 lines up or down)

34

* the rule text is completely contained in the source line

35

36

The simplest way to create the susp-ignored.csv file is by copying

37

undesired entries from suspicious.csv (possibly trimming the last

field.)

"""

Georg Brandl

2010-10-06 10:35:24 +0000

[diff] [blame]

44

import os

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

45

import re

Georg Brandl

2010-10-06 10:35:24 +0000

[diff] [blame]

import csv

import sys

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

49

from docutils import nodes

50

from sphinx.builders import Builder

Pablo Galindo

2018-10-15 20:07:23 +0100

[diff] [blame]

51

import sphinx.util

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

52

Georg Brandl

2010-10-29 05:30:17 +0000

[diff] [blame]

53

# Pattern for fragments that look like reST markup which survived into the
# rendered text.  ``detect_all(text)`` yields one match object per fragment.
_suspicious_markup = re.compile(r'''
    ::(?=[^=])|             # two :: (but NOT ::=)
    :[a-zA-Z][a-zA-Z0-9]+|  # :foo
    `|                      # ` (seldom used by itself)
    (?<!\.)\.\.[ \t]*\w+:   # .. foo: (but NOT ... else:)
    ''', re.UNICODE | re.VERBOSE)
detect_all = _suspicious_markup.finditer

# True when running under Python 3; selects text vs. bytes handling below.
py3 = sys.version_info[0] >= 3

61

Georg Brandl

2010-10-06 10:35:24 +0000

[diff] [blame]

62

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

63

class Rule:
    """One entry of the ignore database.

    Instances are plain records; the builder flips ``used`` to True the
    first time the rule suppresses an issue.
    """

    def __init__(self, docname, lineno, issue, line):
        """A rule for ignoring issues"""
        # Not matched by anything yet.
        self.used = False
        # Document to which this rule applies.
        self.docname = docname
        # Line number in the original source; the rule matches only near
        # that line.  None means "don't care".
        self.lineno = lineno
        # The markup fragment that triggered this rule.
        self.issue = issue
        # Text of the container element (single line only).
        self.line = line

    def __repr__(self):
        # Rendered with an empty line-number field.
        template = '{0.docname},,{0.issue},{0.line}'
        return template.format(self)

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

76

77

Georg Brandl

2010-10-06 10:35:24 +0000

[diff] [blame]

78

79

class dialect(csv.excel):
    """Our dialect: uses only linefeed as newline."""

    # Everything else is inherited from csv.excel; only the row terminator
    # differs.
    lineterminator = '\n'

82

83

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

84

class CheckSuspiciousMarkupBuilder(Builder):
    """
    Checks for possibly invalid markup that may leak into the output.

    Every reported issue is appended to ``suspicious.csv`` in the output
    directory and logged as a warning.  Issues matching a rule loaded from
    ``susp-ignored.csv`` are skipped.
    """
    name = 'suspicious'
    logger = sphinx.util.logging.getLogger("CheckSuspiciousMarkupBuilder")

    def init(self):
        """Create an empty report file and load the ignore rules."""
        # create output file (truncating any previous report)
        self.log_file_name = os.path.join(self.outdir, 'suspicious.csv')
        with open(self.log_file_name, 'w'):
            pass
        # load database of previously ignored issues
        self.load_rules(os.path.join(os.path.dirname(__file__), '..',
                                     'susp-ignored.csv'))

    def get_outdated_docs(self):
        """Return every document found by the environment."""
        return self.env.found_docs

    def get_target_uri(self, docname, typ=None):
        """Return an empty URI; this builder does not use target URIs."""
        return ''

    def prepare_writing(self, docnames):
        """Nothing to prepare."""
        pass

    def write_doc(self, docname, doctree):
        """Walk *doctree* with a SuspiciousVisitor, checking its text."""
        # set when any issue is encountered in this document
        self.any_issue = False
        self.docname = docname
        visitor = SuspiciousVisitor(doctree, self)
        doctree.walk(visitor)

    def finish(self):
        """Log the ignore rules that never matched during this run."""
        unused_rules = [rule for rule in self.rules if not rule.used]
        if unused_rules:
            # Logger.warn() is a deprecated alias; use warning().
            self.logger.warning('Found %s/%s unused rules:' %
                                (len(unused_rules), len(self.rules)))
            for rule in unused_rules:
                self.logger.info(repr(rule))
        return

    def check_issue(self, line, lineno, issue):
        """Report *issue* unless an ignore rule matches it."""
        if not self.is_ignored(line, lineno, issue):
            self.report_issue(line, lineno, issue)

    def is_ignored(self, line, lineno, issue):
        """Determine whether this issue should be ignored.

        Rules are tried in order; the first one that matches is marked as
        used and True is returned.  Returns False when no rule matches.
        """
        docname = self.docname
        for rule in self.rules:
            if rule.docname != docname:
                continue
            if rule.issue != issue:
                continue
            # Both lines must match *exactly*. This is rather strict,
            # and probably should be improved.
            # Doing fuzzy matches with levenshtein distance could work,
            # but that means bringing other libraries...
            # Ok, relax that requirement: just check if the rule fragment
            # is contained in the document line
            if rule.line not in line:
                continue
            # Check both line numbers. If they're "near"
            # this rule matches. (lineno=None means "don't care")
            if (rule.lineno is not None) and \
                    abs(rule.lineno - lineno) > 5:
                continue
            # if it came this far, the rule matched
            rule.used = True
            return True
        return False

    def report_issue(self, text, lineno, issue):
        """Record an issue in the report file and log it as a warning."""
        self.any_issue = True
        self.write_log_entry(lineno, issue, text)
        if py3:
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' %
                                (self.docname, lineno, issue, text))
        else:
            # Python 2: encode to the default encoding, replacing anything
            # that cannot be represented.
            self.logger.warning('[%s:%d] "%s" found in "%-.120s"' % (
                self.docname.encode(sys.getdefaultencoding(), 'replace'),
                lineno,
                issue.encode(sys.getdefaultencoding(), 'replace'),
                text.strip().encode(sys.getdefaultencoding(), 'replace')))
        # record a failing status on the application
        self.app.statuscode = 1

    def write_log_entry(self, lineno, issue, text):
        """Append one row (docname, lineno, issue, text) to the report."""
        if py3:
            # The report is documented as utf-8; newline='' lets the csv
            # writer emit the dialect's line terminator untranslated.
            with open(self.log_file_name, 'a',
                      encoding='utf-8', newline='') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname, lineno, issue, text.strip()])
        else:
            with open(self.log_file_name, 'ab') as f:
                writer = csv.writer(f, dialect)
                writer.writerow([self.docname.encode('utf-8'),
                                 lineno,
                                 issue.encode('utf-8'),
                                 text.strip().encode('utf-8')])

    def load_rules(self, filename):
        """Load database of previously ignored issues.

        A csv file, with exactly the same format as suspicious.csv
        Fields: document name (normalized), line number, issue, surrounding text

        The rules are stored in ``self.rules``.  A file that cannot be
        opened leaves the list empty.  Raises ValueError when a row does
        not have exactly four fields.
        """
        self.logger.info("loading ignore rules... ", nonl=1)
        self.rules = rules = []
        try:
            if py3:
                f = open(filename, 'r', encoding='utf-8', newline='')
            else:
                f = open(filename, 'rb')
        except IOError:
            # Terminate the progress line started above.
            self.logger.info('no ignore file found')
            return
        # Close the file even when a malformed row raises.
        with f:
            for i, row in enumerate(csv.reader(f)):
                if len(row) != 4:
                    raise ValueError(
                        "wrong format in %s, line %d: %s" % (filename, i+1, row))
                docname, lineno, issue, text = row
                # An empty line-number field means "match anywhere".
                if lineno:
                    lineno = int(lineno)
                else:
                    lineno = None
                if not py3:
                    docname = docname.decode('utf-8')
                    issue = issue.decode('utf-8')
                    text = text.decode('utf-8')
                rule = Rule(docname, lineno, issue, text)
                rules.append(rule)
        self.logger.info('done, %d rules loaded' % len(self.rules))

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

211

212

213

def get_lineno(node):
    """Obtain line number information for a node.

    Walks up through ``node.parent`` and returns the first ancestor
    ``line`` that is not None.  The node's own ``line`` is not consulted.
    Returns None when *node* is false or no ancestor carries a line.
    """
    lineno = None
    while lineno is None and node:
        node = node.parent
        if node is None:
            # Ran off the top of the tree without finding a line number.
            break
        lineno = node.line
    return lineno

def extract_line(text, index):
    r"""text may be a multiline string; extract
    only the line containing the given character index.

    A newline character belongs to the line it terminates.

    >>> extract_line("abc\ndefgh\ni", 6)
    'defgh'
    >>> for i in (0, 2, 3, 4, 10):
    ...     print(extract_line("abc\ndefgh\ni", i))
    abc
    abc
    abc
    defgh
    i
    """
    # Start just after the last newline before *index* (0 if there is none).
    start = text.rfind('\n', 0, index) + 1
    # End at the first newline at or after *index*, else at end of text.
    end = text.find('\n', index)
    if end < 0:
        end = len(text)
    return text[start:end]

class SuspiciousVisitor(nodes.GenericNodeVisitor):

lastlineno = 0

def __init__(self, document, builder):

249

nodes.GenericNodeVisitor.__init__(self, document)

250

self.builder = builder

251

252

def default_visit(self, node):

253

if isinstance(node, (nodes.Text, nodes.image)): # direct text containers

254

text = node.astext()

255

# lineno seems to go backwards sometimes (?)

256

self.lastlineno = lineno = max(get_lineno(node) or 0, self.lastlineno)

257

seen = set() # don't report the same issue more than only once per line

258

for match in detect_all(text):

Benjamin Peterson

2009-01-09 03:03:23 +0000

[diff] [blame]

259

issue = match.group()

260

line = extract_line(text, match.start())

261

if (issue, line) not in seen:

262

self.builder.check_issue(line, lineno, issue)

263

seen.add((issue, line))

264

265

unknown_visit = default_visit

266

267

def visit_document(self, node):

268

self.lastlineno = 0

269

270

def visit_comment(self, node):

271

# ignore comments -- too much false positives.

272

# (although doing this could miss some errors;

273

# there were two sections "commented-out" by mistake

Martin Panter

2275e62

2016-06-20 07:52:50 +0000

[diff] [blame]

274

# in the Python docs that would not be caught)

Benjamin Peterson