"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
import functools
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

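# A sketch of how exact_type refines OP tokens (editorial addition): for an
# OP token the string is looked up in EXACT_TOKEN_TYPES, so '+' reports the
# specific PLUS type; non-OP tokens fall through to their generic type.
#
#     tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'x + 1\n')
#     tok.type          # OP
#     tok.exact_type    # PLUS
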
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
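
# Sketch of what these tiny combinators produce (editorial addition):
#
#     group('a', 'bc')   # '(a|bc)'   -- exactly one of the alternatives
#     any('a', 'bc')     # '(a|bc)*'  -- zero or more
#     maybe('a', 'bc')   # '(a|bc)?'  -- zero or one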

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
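
# A few illustrative checks (editorial addition) showing that the assembled
# Number pattern accepts PEP 515 underscore grouping; these use only names
# defined above.
#
#     re.fullmatch(Number, '0x_FF')        # hex integer -> match
#     re.fullmatch(Number, '1_000.5e-3')   # float with exponent -> match
#     re.fullmatch(Number, '3+4j')         # an expression, not a literal -> None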
Guido van Rossum4d8e8591992-01-01 19:34:47 +000082
Eric V. Smith1c8222c2015-10-26 04:37:55 -040083# Return the empty string, plus all of the valid string prefixes.
84def _all_string_prefixes():
85 # The valid string prefixes. Only contain the lower case versions,
penguindustin96466302019-05-06 14:57:17 -040086 # and don't contain any permutations (include 'fr', but not
Eric V. Smith1c8222c2015-10-26 04:37:55 -040087 # 'rf'). The various permutations will be generated.
88 _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
89 # if we add binary f-strings, add: ['fb', 'fbr']
Jon Dufresne39726282017-05-18 07:35:54 -070090 result = {''}
Eric V. Smith1c8222c2015-10-26 04:37:55 -040091 for prefix in _valid_string_prefixes:
92 for t in _itertools.permutations(prefix):
93 # create a list with upper and lower versions of each
94 # character
95 for u in _itertools.product(*[(c, c.upper()) for c in t]):
96 result.add(''.join(u))
97 return result
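
# Sketch (editorial addition): the result contains every case permutation of
# each prefix, plus the empty string, e.g. 'fr' contributes 'fr', 'fR', 'Fr',
# 'FR', 'rf', 'rF', 'Rf' and 'RF'.
#
#     prefixes = _all_string_prefixes()
#     '' in prefixes and 'Rb' in prefixes and 'fR' in prefixes   # all True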

@functools.lru_cache
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
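
# Sketch (editorial addition): endpats and both sets key on
# "prefix + opening quote", so a raw bytes literal is covered by entries
# like these.
#
#     endpats["rb'"] is Single     # tail pattern for rb'...'
#     "rb'" in single_quoted       # True
#     'RB"""' in triple_quoted     # True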

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
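
# A hedged round-trip sketch (editorial addition) using the limited
# (2-tuple) mode described in the docstring above:
#
#     from io import BytesIO
#     src = b"x = (1 +\n     2)\n"
#     pairs = [tok[:2] for tok in tokenize(BytesIO(src).readline)]
#     new = untokenize(pairs)        # bytes, encoded per the ENCODING token
#     [t[:2] for t in tokenize(BytesIO(new).readline)] == pairs   # True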


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
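
# Sketch (editorial addition): normalization maps common spellings onto the
# canonical names the C tokenizer recognizes.
#
#     _get_normal_name('UTF_8')     # 'utf-8'
#     _get_normal_name('Latin-1')   # 'iso-8859-1'
#     _get_normal_name('euc-jp')    # 'euc-jp' (returned unchanged)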

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
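
# A hedged usage sketch (editorial addition): detecting a cookie on an
# in-memory source.
#
#     from io import BytesIO
#     buf = BytesIO(b'# -*- coding: latin-1 -*-\nprint("hi")\n')
#     detect_encoding(buf.readline)
#     # -> ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])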


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
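
# Sketch (editorial addition): tokenize.open() hands back a text stream
# already decoded with the detected encoding ('some_module.py' is a
# placeholder path).
#
#     import tokenize
#     with tokenize.open('some_module.py') as f:
#         f.encoding      # e.g. 'utf-8', or whatever the cookie declared
#         source = f.read()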


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)
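
# A hedged sketch (editorial addition) of driving tokenize() from a real
# file, mirroring the docstring above ('some_module.py' is a placeholder):
#
#     with _builtin_open('some_module.py', 'rb') as f:
#         for tok in tokenize(f.readline):
#             print(tok.start, tok_name[tok.exact_type], repr(tok.string))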


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)
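
# Sketch (editorial addition): the str-based twin of tokenize(); there is no
# leading ENCODING token because the input is already decoded.
#
#     import io
#     g = generate_tokens(io.StringIO('x = 1\n').readline)
#     next(g).string    # 'x' -- a NAME token, not ENCODING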

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
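
# Hedged command-line sketch (editorial addition; 'example.py' is a
# placeholder, and the exact columns come from the %-20s%-15s%-15r format
# above):
#
#     $ python -m tokenize -e example.py
#     0,0-0,0:            ENCODING       'utf-8'
#     1,0-1,1:            NAME           'x'
#     1,2-1,3:            EQUAL          '='
#     ...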