"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '...': ELLIPSIS,
    '->': RARROW,
    '@': AT,
    '@=': ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

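# Illustrative sketch, not part of the module's API: exact_type refines a
# generic OP token into its specific operator type, while .type is
# returned unchanged for everything else.
#
#     >>> tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'x + 1\n')
#     >>> tok.type == OP, tok.exact_type == PLUS
#     (True, True)
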
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  Only the lower case versions are
    # listed here, and only one ordering of each pair ('br' and 'fr',
    # but not 'rb' or 'rf'); the remaining case and order permutations
    # are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # generate every combination of upper and lower case for
            # each character of this permutation
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
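
# A hedged illustration of what the helper above produces: a single
# two-letter prefix expands to every case and order variant, e.g. 'fr':
#
#     >>> sorted(p for p in _all_string_prefixes() if set(p.lower()) == {'f', 'r'})
#     ['FR', 'Fr', 'RF', 'Rf', 'fR', 'fr', 'rF', 'rf']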

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string.  _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
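
# Hedged examples of the two sets built above:
#
#     >>> "'" in single_quoted and 'Rb"' in single_quoted
#     True
#     >>> "rb'''" in triple_quoted
#     True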

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
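
# Concrete round-trip sketch (hedged; uses only the public API above):
#
#     >>> from io import BytesIO
#     >>> toks = list(tokenize(BytesIO(b"1 + 2\n").readline))
#     >>> untokenize(toks)
#     b'1 + 2\n'
#
# With full 5-tuples the source is reproduced exactly; with 2-tuples the
# compat() path is used and only token-level equivalence is guaranteed.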


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.
    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8.  Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
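
# Usage sketch (hedged; BytesIO stands in for a real binary file object):
#
#     >>> from io import BytesIO
#     >>> detect_encoding(BytesIO(b"# -*- coding: latin-1 -*-\n").readline)
#     ('iso-8859-1', [b'# -*- coding: latin-1 -*-\n'])
#
# The cookie name is normalized by _get_normal_name(), so 'latin-1' comes
# back as its canonical 'iso-8859-1' spelling.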


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
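
# Sketch of typical use ('example.py' is a hypothetical file name); note
# that this is tokenize.open, not builtins.open:
#
#     >>> with open('example.py') as f:
#     ...     source = f.read()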


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
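
# Minimal usage sketch (hedged; assumes only the generator defined above):
#
#     >>> from io import BytesIO
#     >>> for tok in tokenize(BytesIO(b"x = 1\n").readline):
#     ...     print(tok)
#
# The first token is always ENCODING (here 'utf-8'), followed by NAME 'x',
# OP '=', NUMBER '1', NEWLINE and ENDMARKER tokens.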


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:                                # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or     # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set.  If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token.  If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token.  This is looking for the matching end
                        # regex for the correct type of quote
                        # character.  So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():   # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':          # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the standard
# library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)
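
# Sketch (hedged): unlike tokenize(), generate_tokens() works on a
# readline that returns str, and it emits no ENCODING token:
#
#     >>> import io
#     >>> [t.string for t in generate_tokens(io.StringIO("x = 1\n").readline)]
#     ['x', '=', '1', '\n', '']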

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()
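
# Command-line sketch ('example.py' is a placeholder file name): running
# "python -m tokenize example.py" prints one token per line in the format
# srow,scol-erow,ecol: TYPE 'string'; adding -e/--exact reports OP tokens
# by their exact type (PLUS, LPAR, ...).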