"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens.  It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF).  It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.  Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

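# A minimal usage sketch (illustrative only; the input literal is made up):
#
#     from io import BytesIO
#     for tok in tokenize(BytesIO(b"x = 1\n").readline):
#         print(tok.type, repr(tok.string))
#
# which yields ENCODING 'utf-8', NAME 'x', OP '=', NUMBER '1', NEWLINE,
# and finally ENDMARKER.
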
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
import itertools as _itertools
import re
import sys
from token import *

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
                           "NL", "untokenize", "ENCODING", "TokenInfo"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
EXACT_TOKEN_TYPES = {
    '(': LPAR,
    ')': RPAR,
    '[': LSQB,
    ']': RSQB,
    ':': COLON,
    ',': COMMA,
    ';': SEMI,
    '+': PLUS,
    '-': MINUS,
    '*': STAR,
    '/': SLASH,
    '|': VBAR,
    '&': AMPER,
    '<': LESS,
    '>': GREATER,
    '=': EQUAL,
    '.': DOT,
    '%': PERCENT,
    '{': LBRACE,
    '}': RBRACE,
    '==': EQEQUAL,
    '!=': NOTEQUAL,
    '<=': LESSEQUAL,
    '>=': GREATEREQUAL,
    '~': TILDE,
    '^': CIRCUMFLEX,
    '<<': LEFTSHIFT,
    '>>': RIGHTSHIFT,
    '**': DOUBLESTAR,
    '+=': PLUSEQUAL,
    '-=': MINEQUAL,
    '*=': STAREQUAL,
    '/=': SLASHEQUAL,
    '%=': PERCENTEQUAL,
    '&=': AMPEREQUAL,
    '|=': VBAREQUAL,
    '^=': CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//': DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@': AT,
    '@=': ATEQUAL,
}

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

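# For example (a sketch, not part of the public API surface): an OP token
# whose string is '+' has .type == OP but .exact_type == PLUS, so callers
# that need to tell operators apart can consult .exact_type.
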
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

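# E.g. group('a', 'b') == '(a|b)', any('x') == '(x)*' and maybe('x') == '(x)?';
# these helpers build up the token regexes below.
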
# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

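# For instance, Number matches literals such as '1_000', '0xFF', '3.14e-2'
# and '1j' (PEP 515 underscore group separators included).
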
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. This list contains only the lower case
    # versions, and no permutations (it includes 'fr', but not 'rf');
    # the case and order permutations are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = set([''])
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
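
# For example, the generated set includes '', 'b', 'B', 'rb', 'bR', 'fR',
# and so on: every casing of every ordering of each valid prefix.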

def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
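
# So, for example, "'" and 'rb"' land in single_quoted, while "'''" and
# 'f"""' land in triple_quoted.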

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only those two elements are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
Trent Nelson428de652008-03-18 22:41:35 +0000343
Benjamin Petersond3afada2009-10-09 21:43:09 +0000344def _get_normal_name(orig_enc):
345 """Imitates get_normal_name in tokenizer.c."""
346 # Only care about the first 12 characters.
347 enc = orig_enc[:12].lower().replace("_", "-")
348 if enc == "utf-8" or enc.startswith("utf-8-"):
349 return "utf-8"
350 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
351 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
352 return "iso-8859-1"
353 return orig_enc
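
# For example, _get_normal_name('UTF_8') returns 'utf-8' and
# _get_normal_name('Latin-1') returns 'iso-8859-1'.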

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError is raised.  Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
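
# A usage sketch (the file name is hypothetical; builtins.open is spelled
# out because this module rebinds the name `open` below):
#
#     import builtins
#     with builtins.open('example.py', 'rb') as f:
#         encoding, consumed = detect_encoding(f.readline)
#     # encoding is e.g. 'utf-8'; consumed holds the (at most two) raw
#     # lines read while looking for a coding cookie.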


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
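# A usage sketch (file name hypothetical):
#
#     with open('example.py') as f:     # this module's open(), not builtins'
#         source = f.read()             # decoded per cookie/BOM; utf-8 default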

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    # This import is here to avoid problems when the itertools module is not
    # built yet and tokenize is imported.
    from itertools import chain, repeat
    encoding, consumed = detect_encoding(readline)
    rl_gen = iter(readline, b"")
    empty = repeat(b"")
    return _tokenize(chain(consumed, rl_gen, empty).__next__, encoding)
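
# A round-trip sketch over an in-memory buffer (the input literal is made
# up; the equality follows the full-input invariant documented above in
# untokenize):
#
#     from io import BytesIO
#     source = b"def f():\n    return 1\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source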


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False
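    # ('async' and 'await' are not reserved words at this point in the
    # language's history, so they only become ASYNC/AWAIT tokens inside an
    # 'async def' block; an 'async' NAME token is stashed until we can see
    # whether a 'def' follows it.)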

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    while True:             # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield TokenInfo(NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if stashed:
                        yield stashed
                        stashed = None
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)
                        if async_def:
                            async_def_nl = True

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                # they're in the single_quoted set. If so, they start
                # a string.
                # We're using the first 3, because we're looking for
                # "rb'" (for example) at the start of the token. If
                # we switch to longer prefixes, this needs to be
                # adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                # triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        # token. This is looking for the matching end
                        # regex for the correct type of quote
                        # character. So it's really looking for
                        # endpats["'"] or endpats['"'], by trying to
                        # skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield TokenInfo(
                                ASYNC if token == 'async' else AWAIT,
                                token, spos, epos, line)
                            continue

                    tok = TokenInfo(NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed.type == NAME
                                and stashed.string == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield TokenInfo(ASYNC, stashed.string,
                                            stashed.start, stashed.end,
                                            stashed.line)
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


# An undocumented, backwards compatible API for all the places in the
# standard library that expect to be able to use tokenize with strings
def generate_tokens(readline):
    return _tokenize(readline, None)

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        print(message, file=sys.stderr)

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()