"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
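
# For example, in "1 + 2" the '+' token has .type == OP but .exact_type
# == PLUS (a REPL sketch; the exact types live in token.py):
#
#     >>> from io import BytesIO
#     >>> from tokenize import tokenize, tok_name
#     >>> toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
#     >>> [tok_name[t.exact_type] for t in toks if t.type == OP]
#     ['PLUS']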

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
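
# E.g. group('a', 'b') == '(a|b)', any('a', 'b') == '(a|b)*', and
# maybe('a', 'b') == '(a|b)?'.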

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only the lower case versions are listed,
    # without permutations ('fr' is included, but not 'rf'); all case and
    # order permutations are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # generate every combination of upper and lower case for
            # each character in this ordering
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
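
# For instance, the single entry 'fr' expands to all eight case/order
# variants: 'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'.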

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string.  _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
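
# For example, single_quoted contains "'", '"', "r'", 'Rb"', and so on,
# while triple_quoted contains "'''", '"""', "f'''", 'BR"""', etc.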

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
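
# E.g. _get_normal_name('UTF_8') returns 'utf-8', and
# _get_normal_name('Latin-1') returns 'iso-8859-1'.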

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError is raised.  If the encoding cookie is an
    invalid charset, a SyntaxError is also raised.  Note that if a UTF-8 BOM
    is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8.  Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
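
# A short sketch of detect_encoding() on an in-memory source (io.BytesIO
# stands in for a real file):
#
#     >>> from io import BytesIO
#     >>> src = b'# -*- coding: latin-1 -*-\nx = 1\n'
#     >>> detect_encoding(BytesIO(src).readline)
#     ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])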


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set.  If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token.  If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token.  This is looking for the matching end
                        # regex for the correct type of quote
                        # character.  So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)),
                        (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)

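
# A sketch of generate_tokens(), which accepts a str-producing readline
# (io.StringIO stands in for a real text file):
#
#     import io
#     for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#         print(tok)

# Command-line use: `python -m tokenize file.py` prints one token per line;
# pass -e/--exact to label operators with their exact type (PLUS, EQUAL, ...).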
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()