Blame - Lib/tokenize.py - platform/external/python/cpython3

1992-01-01 19:34:47 +0000

[diff] [blame]

2

Florent Xicluna

2010-09-03 19:54:02 +0000

[diff] [blame]

3

tokenize(readline) is a generator that breaks a stream of bytes into

4

Python tokens. It decodes the bytes according to PEP-0263 for

5

determining source file encoding.

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

6

Florent Xicluna

2010-09-03 19:54:02 +0000

[diff] [blame]

7

It accepts a readline-like method which is called repeatedly to get the

8

next line of input (or b"" for EOF). It generates 5-tuples with these

9

members:

Tim Peters

4efb6e9

2001-06-29 23:51:08 +0000

[diff] [blame]

10

11

the token type (see token.py)

12

the token (a string)

13

the starting (row, column) indices of the token (a 2-tuple of ints)

14

the ending (row, column) indices of the token (a 2-tuple of ints)

15

the original line (string)

16

17

It is designed to match the working of the Python tokenizer exactly, except

18

that it produces COMMENT tokens for comments and gives type OP for all

Florent Xicluna

2010-09-03 19:54:02 +0000

[diff] [blame]

19

operators. Additionally, all token lists start with an ENCODING token

20

which tells you which encoding was used to decode the bytes stream.

21

"""

Guido van Rossum

b51eaa1

1997-03-07 00:21:55 +0000

[diff] [blame]

22

Ka-Ping Yee

244c593

2001-03-01 13:56:40 +0000

[diff] [blame]

23

# Module metadata: the original author and the later contributors.
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')

Serhiy Storchaka

cf4a2f2

2015-03-11 17:18:03 +0200

[diff] [blame]

27

from builtins import open as _builtin_open

Benjamin Peterson

2008-12-12 01:25:05 +0000

[diff] [blame]

28

from codecs import lookup, BOM_UTF8

Raymond Hettinger

3fb79c7

2010-09-09 07:15:18 +0000

[diff] [blame]

29

import collections

Victor Stinner

58c0752

2010-11-09 01:08:59 +0000

[diff] [blame]

30

from io import TextIOWrapper

Terry Jan Reedy

2014-02-17 23:12:16 -0500

[diff] [blame]

31

from itertools import chain

Eric V. Smith

2015-10-26 04:37:55 -0400

[diff] [blame]

32

import itertools as _itertools

Terry Jan Reedy

2014-02-17 23:12:16 -0500

[diff] [blame]

import re

import sys

from token import *

Serhiy Storchaka

2013-09-16 23:51:56 +0300

[diff] [blame]

37

# Matches an encoding declaration inside a comment line; group 1 is the
# declared encoding name.  Applied to the decoded text of a source line.
cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Bytes pattern matching a line that is empty, whitespace-only, or whose
# first non-blank character starts a comment or a line ending.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

# Re-export everything the token module exports, plus the names added here.
# The module object itself is only needed to read its __all__.
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

# Three extra token types are numbered after the ones imported from token;
# each is registered in tok_name, then N_TOKENS is bumped to include them.
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3

Meador Inge

2012-01-19 00:44:45 -0600

[diff] [blame]

52

# Maps the source text of an operator or delimiter to its specific token
# type.  TokenInfo.exact_type consults this table for tokens of type OP.
EXACT_TOKEN_TYPES = {
    # Brackets and separators.
    '(': LPAR, ')': RPAR, '[': LSQB, ']': RSQB,
    ':': COLON, ',': COMMA, ';': SEMI,
    # Single-character operators.
    '+': PLUS, '-': MINUS, '*': STAR, '/': SLASH,
    '|': VBAR, '&': AMPER, '<': LESS, '>': GREATER,
    '=': EQUAL, '.': DOT, '%': PERCENT,
    '{': LBRACE, '}': RBRACE,
    # Comparisons.
    '==': EQEQUAL, '!=': NOTEQUAL, '<=': LESSEQUAL, '>=': GREATEREQUAL,
    # Bitwise, shift and power operators.
    '~': TILDE, '^': CIRCUMFLEX,
    '<<': LEFTSHIFT, '>>': RIGHTSHIFT, '**': DOUBLESTAR,
    # Augmented assignments.
    '+=': PLUSEQUAL, '-=': MINEQUAL, '*=': STAREQUAL, '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL, '&=': AMPEREQUAL, '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL, '>>=': RIGHTSHIFTEQUAL, '**=': DOUBLESTAREQUAL,
    # Floor division.
    '//': DOUBLESLASH, '//=': DOUBLESLASHEQUAL,
    # Matrix multiplication / decorator marker.
    '@': AT, '@=': ATEQUAL,
}

Guido van Rossum

1997-04-08 14:24:39 +0000

[diff] [blame]

98

Raymond Hettinger

3fb79c7

2010-09-09 07:15:18 +0000

[diff] [blame]

99

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A token as a named 5-tuple: type, string, start, end and line."""

    def __repr__(self):
        # Show the numeric type together with its symbolic name from tok_name.
        described = self._replace(
            type='%d (%s)' % (self.type, tok_name[self.type]))
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
                % described)

    @property
    def exact_type(self):
        """Return the specific operator type for OP tokens, else ``type``.

        An OP token whose string is not in EXACT_TOKEN_TYPES keeps its
        generic type.
        """
        if self.type != OP:
            return self.type
        return EXACT_TOKEN_TYPES.get(self.string, self.type)

Eric S. Raymond

2001-02-09 11:10:16 +0000

[diff] [blame]

112

def group(*choices):
    """Return a regex group matching any one of the given sub-patterns."""
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    """Return a regex matching zero or more repetitions of the alternatives.

    Note: within this module the name shadows the builtin any().
    """
    alternatives = group(*choices)
    return alternatives + '*'


def maybe(*choices):
    """Return a regex matching at most one occurrence of the alternatives."""
    alternatives = group(*choices)
    return alternatives + '?'

Guido van Rossum

1992-01-01 19:34:47 +0000

[diff] [blame]

115

Antoine Pitrou

2008-08-19 17:56:33 +0000

[diff] [blame]

116

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.

# Optional run of spaces, form feeds and tabs.
Whitespace = r'[ \f\t]*'
# A '#' comment, up to but not including the line ending.
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash-newline continuations (each followed
# by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
# One or more word characters.
Name = r'\w+'

# Integer literals: prefixed hex / binary / octal, or decimal (either a run
# of zeros or a number with no leading zero).
Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+'
Octnumber = r'0[oO][0-7]+'
Decnumber = r'(?:0+|[1-9][0-9]*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Float literals: digits around a decimal point with an optional exponent,
# or digits followed by a mandatory exponent.
Exponent = r'[eE][-+]?[0-9]+'
Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
Expfloat = r'[0-9]+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals: digits or a float followed by j/J.
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried in this order: imaginary, float, integer.
Number = group(Imagnumber, Floatnumber, Intnumber)

Guido van Rossum

1992-01-01 19:34:47 +0000

[diff] [blame]

134

Eric V. Smith

2015-10-26 04:37:55 -0400

[diff] [blame]

135

# Return the empty string, plus all of the valid string prefixes.

136

def _all_string_prefixes():

137

# The valid string prefixes. Only contain the lower case versions,

138

# and don't contain any permuations (include 'fr', but not

139

# 'rf'). The various permutations will be generated.

140

_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']

141

# if we add binary f-strings, add: ['fb', 'fbr']

142

result = set([''])

143

for prefix in _valid_string_prefixes:

144

for t in _itertools.permutations(prefix):

145

# create a list with upper and lower versions of each

146

# character

147

for u in _itertools.product(*[(c, c.upper()) for c in t]):

148

result.add(''.join(u))

return result

def _compile(expr):

return re.compile(expr, re.UNICODE)

153

154

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string: optional prefix plus three quotes.
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

# Any single bracket character: ] [ ( ) { }.
Bracket = '[][(){}]'
# Line endings, the ellipsis, and single punctuation characters.
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

# A complete token on one line, and the same preceded by ignorable text.
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# A backslash-newline continuation or end of input, a comment, or the
# opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# Leading whitespace followed by one of the alternatives above; ContStr
# stands in for String so a string continued onto the next line also matches.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

Guido van Rossum

1997-04-08 14:24:39 +0000

[diff] [blame]

193

Eric V. Smith

2015-10-26 04:37:55 -0400

[diff] [blame]

194

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
# Note: the loop variables t and u remain bound at module level afterwards.
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

# NOTE(review): not referenced in the visible part of this file; presumably
# the tab stop width used when measuring indentation -- confirm.
tabsize = 8

Fred Drake

9b8d801

2000-08-17 04:45:13 +0000

[diff] [blame]

215

Ka-Ping Yee

28c62bb

2001-03-23 05:22:49 +0000

[diff] [blame]

216

class TokenError(Exception):
    """Tokenizer error type.

    NOTE(review): the raise sites are outside this section of the file;
    presumably raised when the input cannot be tokenized -- confirm.
    """


class StopTokenizing(Exception):
    """Exception type with no behaviour of its own.

    NOTE(review): not raised anywhere in the visible part of the file;
    presumably a signal to stop tokenizing early -- confirm.
    """

Fred Drake

9b8d801

2000-08-17 04:45:13 +0000

[diff] [blame]

219

Tim Peters

5ca576e

2001-06-18 22:08:13 +0000

[diff] [blame]

220

Thomas Wouters

2006-12-13 04:49:30 +0000

[diff] [blame]

class Untokenizer:
    """Rebuild source text from a stream of tokens.

    Output fragments accumulate in ``self.tokens``.  ``prev_row`` and
    ``prev_col`` record where the last emitted token ended, and ``encoding``
    holds the value of the most recent ENCODING token (None if none was seen).
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        """Emit the filler needed to reach position *start*.

        Skipped rows are emitted as backslash-newline continuations and
        skipped columns as spaces.  Raises ValueError if *start* lies before
        the end of the previous token.
        """
        row, col = start
        if (row, col) < (self.prev_row, self.prev_col):
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        skipped_rows = row - self.prev_row
        if skipped_rows:
            self.tokens.append("\\\n" * skipped_rows)
            # A new row starts at column zero.
            self.prev_col = 0
        padding = col - self.prev_col
        if padding:
            self.tokens.append(" " * padding)

    def untokenize(self, iterable):
        """Return the source text described by the tokens in *iterable*.

        Full 5-tuples are laid out using their start/end positions.  As soon
        as a 2-tuple is met, it and the rest of the stream are handed to
        compat(), which works without position information.
        """
        stream = iter(iterable)
        indent_stack = []
        at_line_start = False
        for entry in stream:
            if len(entry) == 2:
                self.compat(entry, stream)
                break
            kind, text, start, end, _line = entry
            if kind == ENCODING:
                self.encoding = text
                continue
            if kind == ENDMARKER:
                break
            if kind == INDENT:
                indent_stack.append(text)
                continue
            if kind == DEDENT:
                indent_stack.pop()
                self.prev_row, self.prev_col = end
                continue
            ends_line = kind in (NEWLINE, NL)
            if ends_line:
                at_line_start = True
            elif at_line_start and indent_stack:
                # First token on a line inside an indented block: emit the
                # recorded indent text, but only if the token does not start
                # to the left of it.
                current_indent = indent_stack[-1]
                if start[1] >= len(current_indent):
                    self.tokens.append(current_indent)
                    self.prev_col = len(current_indent)
                at_line_start = False
            self.add_whitespace(start)
            self.tokens.append(text)
            self.prev_row, self.prev_col = end
            if ends_line:
                # The next token starts on a fresh row.
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for position-less tokens to ``self.tokens``.

        *token* is the first entry to process and *iterable* yields the rest.
        Only the first two items (type, string) of each entry are used.
        """
        indent_stack = []
        emit = self.tokens.append
        at_line_start = token[0] in (NEWLINE, NL)
        after_string = False

        for entry in chain([token], iterable):
            kind, text = entry[:2]
            if kind == ENCODING:
                self.encoding = text
                continue

            # Keep adjacent names and numbers from running together.
            if kind in (NAME, NUMBER, ASYNC, AWAIT):
                text += ' '

            # Insert a space between two consecutive strings
            is_string = kind == STRING
            if is_string and after_string:
                text = ' ' + text
            after_string = is_string

            if kind == INDENT:
                indent_stack.append(text)
                continue
            if kind == DEDENT:
                indent_stack.pop()
                continue
            if kind in (NEWLINE, NL):
                at_line_start = True
            elif at_line_start and indent_stack:
                emit(indent_stack[-1])
                at_line_start = False
            emit(text)

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

314

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

315

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

316

def untokenize(iterable):
    """Transform tokens back into Python source code.

    The result is a bytes object, encoded with the encoding named by the
    ENCODING token (the first token sequence output by tokenize).  If the
    stream contains no ENCODING token, the text is returned as a str.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element sequences are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    worker = Untokenizer()
    source = worker.untokenize(iterable)
    if worker.encoding is None:
        return source
    return source.encode(worker.encoding)

Raymond Hettinger

2005-06-10 11:05:19 +0000

[diff] [blame]

341

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

342

Benjamin Peterson

d3afada

2009-10-09 21:43:09 +0000

[diff] [blame]

343

def _get_normal_name(orig_enc):

344

"""Imitates get_normal_name in tokenizer.c."""

345

# Only care about the first 12 characters.

346

enc = orig_enc[:12].lower().replace("_", "-")

347

if enc == "utf-8" or enc.startswith("utf-8-"):

348

return "utf-8"

349

if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \

350

enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):

return "iso-8859-1"

return orig_enc

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

354

def detect_encoding(readline):
    """
    Detect the encoding that should be used to decode a Python source file.

    readline is a callable used in the same way as the one passed to the
    tokenize() generator.  It is called at most twice.

    Returns a 2-tuple: the encoding (as a string) and the list of lines
    (left as bytes) that were read.

    The encoding is taken from a UTF-8 BOM or from an encoding cookie as
    specified in PEP 263.  If both a BOM and a cookie are present but
    disagree, a SyntaxError is raised.  A SyntaxError is also raised if the
    cookie names an unknown charset.  When a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, the default of 'utf-8' is returned.
    """
    # Mention the file name in error messages when readline is a bound
    # method of an object that has a name attribute.
    filename = getattr(getattr(readline, '__self__', None), 'name', None)
    bom_found = False
    default = 'utf-8'

    def next_line():
        # Treat an exhausted iterator-style readline like end of file.
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            text = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        found = cookie_re.match(text)
        if not found:
            return None
        encoding = _get_normal_name(found.group(1))
        try:
            # Only checks that the codec exists; the result is not needed.
            lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if not bom_found:
            return encoding
        if encoding != 'utf-8':
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = 'encoding problem: utf-8'
            else:
                msg = 'encoding problem for {!r}: utf-8'.format(filename)
            raise SyntaxError(msg)
        return encoding + '-sig'

    first = next_line()
    if first.startswith(BOM_UTF8):
        bom_found = True
        # Drop the three BOM bytes.
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    consumed = [first]
    encoding = find_cookie(first)
    if encoding:
        return encoding, consumed
    # A cookie on the second line only counts when the first line is blank
    # or a comment.
    if not blank_re.match(first):
        return default, consumed

    second = next_line()
    if not second:
        return default, consumed

    consumed.append(second)
    encoding = find_cookie(second)
    return (encoding or default), consumed

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

445

446

Victor Stinner

58c0752

2010-11-09 01:08:59 +0000

[diff] [blame]

447

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().

    Returns a line-buffered TextIOWrapper positioned at the start of the
    file, with its mode attribute set to 'r'.  If anything fails after the
    file has been opened, the file is closed and the exception propagates.
    """
    raw = _builtin_open(filename, 'rb')
    try:
        encoding = detect_encoding(raw.readline)[0]
        # Detection consumed up to two lines; rewind so the wrapper sees
        # the whole file.
        raw.seek(0)
        wrapper = TextIOWrapper(raw, encoding, line_buffering=True)
        wrapper.mode = 'r'
        return wrapper
    except BaseException:
        # Catch everything on purpose: the file must be closed before any
        # failure is re-raised.
        raw.close()
        raise

Victor Stinner

58c0752

2010-11-09 01:08:59 +0000

[diff] [blame]

461

462

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

463

def tokenize(readline):
    """
    Tokenize a stream of bytes supplied one line at a time by readline.

    readline must be a callable object which provides the same interface
    as the readline() method of built-in file objects.  Each call should
    return one line of input as bytes.  Alternatively, readline can be a
    callable that terminates by raising StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, already_read = detect_encoding(readline)
    # Replay the lines consumed by detection, then the rest of the input
    # (stopping at the first b""), then an endless supply of b"" so that
    # reads past the end keep reporting EOF.
    remaining = iter(readline, b"")
    eof_forever = repeat(b"")
    line_source = chain(already_read, remaining, eof_forever)
    return _tokenize(line_source.__next__, encoding)

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

489

490

491

def _tokenize(readline, encoding):
    """Yield TokenInfo tuples for the source lines returned by *readline*.

    readline -- zero-argument callable returning the next line of input;
                an empty line, or a raised StopIteration, marks the end of
                the input.
    encoding -- if not None, every line is decoded from bytes with it and
                an ENCODING token is yielded first ("utf-8-sig" is reported
                as "utf-8"); if None, lines are used as readline returns
                them and no ENCODING token is produced.

    Raises TokenError when the input ends inside a multi-line string or a
    multi-line statement, and IndentationError when a dedent does not match
    any enclosing indentation level.
    """
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing.  'stashed'
    # holds back a NAME token for 'async' until the following token shows
    # whether it starts an 'async def'.
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, line_end = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                                strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued with a backslash but
                # this line neither closes nor continues it.
                yield TokenInfo(ERRORTOKEN, contstr + line,
                                strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < line_end:              # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == line_end:
                # Only whitespace and no line terminator: end of input.
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                                    (lnum, pos),
                                    (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)
                # Whatever remains of the line is the (non-logical) newline.
                yield TokenInfo(NL, line[pos:],
                                (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents.pop()

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < line_end:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                  # ordinary number
                        (initial == '.' and token != '.' and token != '...')):
                    # Emit a held-back 'async' first so tokens stay in
                    # source order.
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    # Flush before the string, including the multi-line
                    # case where the STRING token is only produced while
                    # processing a later line.
                    if stashed:
                        yield stashed
                        stashed = None
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if stashed:
                        yield stashed
                        stashed = None
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            # Inside an 'async def' body these names get
                            # their dedicated token types.
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        # Hold it back until the next token is known.
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            # Re-issue the held-back 'async' as ASYNC.
                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # A stashed 'async' is deliberately kept across the
                    # line continuation.
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                if stashed:
                    yield stashed
                    stashed = None
                yield TokenInfo(ERRORTOKEN, line[pos],
                                (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for _ in indents[1:]:                      # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

Guido van Rossum

1997-03-07 00:21:12 +0000

[diff] [blame]

724

Trent Nelson

2008-03-18 22:41:35 +0000

[diff] [blame]

725

726

# An undocumented, backwards compatible, API for all the places in the standard

727

# library that expect to be able to use tokenize with strings

728

def generate_tokens(readline):
    """Tokenize the lines returned by *readline* without decoding them.

    Delegates to _tokenize() with no encoding, so the lines are used as
    readline returns them and no ENCODING token is produced.
    """
    return _tokenize(readline, encoding=None)

Raymond Hettinger

6c60d09

2010-09-09 04:32:39 +0000

[diff] [blame]

730

Meador Inge

2011-10-07 08:53:38 -0500

[diff] [blame]

def main():
    """Command-line entry point: tokenize a file or stdin and print tokens.

    One line is printed per token: its "srow,scol-erow,ecol:" range, the
    token type name, and the repr of the token string.  With -e/--exact the
    token's exact_type is shown instead of its type.  Tokenizing errors are
    reported on stderr and terminate the process with exit status 1.
    """
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        # Build the most specific report the available context allows,
        # print it to stderr, then exit with status 1.
        if location:
            report = "%s:%d:%d: error: %s" % (
                (filename,) + location + (message,))
        elif filename:
            report = "%s: error: %s" % (filename, message)
        else:
            report = "error: %s" % message
        perror(report)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(
        dest='filename',
        nargs='?',
        metavar='filename.py',
        help='the file to tokenize; defaults to stdin',
    )
    parser.add_argument(
        '-e', '--exact',
        dest='exact',
        action='store_true',
        help='display token names using the exact type',
    )
    args = parser.parse_args()

    # Name used in error reports; stdin gets a placeholder name.
    filename = args.filename or "<stdin>"

    try:
        # Tokenize the input
        if args.filename:
            with _builtin_open(filename, 'rb') as source:
                tokens = list(tokenize(source.readline))
        else:
            # stdin lines are passed through without an encoding, and the
            # tokens are produced lazily while printing.
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            shown_type = token.exact_type if args.exact else token.type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[shown_type], token.string))
    except IndentationError as err:
        # args[1] is the (filename, lineno, offset, text) detail tuple.
        lineno, offset = err.args[1][1:3]
        error(err.args[0], filename, (lineno, offset))
    except TokenError as err:
        lineno, offset = err.args[1]
        error(err.args[0], filename, (lineno, offset))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

790

Raymond Hettinger

6c60d09

2010-09-09 04:32:39 +0000

[diff] [blame]

791

if __name__ == "__main__":

Meador Inge