# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
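
# A minimal usage sketch (illustrative; not part of the original module):
# pass any readline-style callable to generate_tokens() and iterate over
# the 5-tuples it yields.
#
#   import io
#   for tok in generate_tokens(io.StringIO("x = 1\n").readline):
#       print(tok)   # NAME, OP, NUMBER, NEWLINE, ENDMARKER 5-tuples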

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
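
# Illustrative expansions of the helpers above:
#   group('a', 'b') -> '(a|b)'
#   any('a', 'b')   -> '(a|b)*'
#   maybe('a', 'b') -> '(a|b)?'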

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
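
# Illustrative matches (not exhaustive): '0b1010' (Binnumber),
# '0xDEAD_BEEF' (Hexnumber), '0o777' (Octnumber), '1_000' (Decnumber),
# '3.14e-10' (Floatnumber), '2j' (Imagnumber).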

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][bB]|[bBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
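
# For instance (illustrative): pseudoprog.match("  x = 1\n").span(1) is (2, 3);
# the leading Whitespace is consumed and group 1 spans the first token, "x".
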
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "f'''": single3prog, 'f"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "rb'''": single3prog, 'rb"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "F'''": single3prog, 'F"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            "rB'''": single3prog, 'rB"""': double3prog,
            "Rb'''": single3prog, 'Rb"""': double3prog,
            "RB'''": single3prog, 'RB"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'f': None, 'F': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "f'''", 'f"""', "F'''", 'F"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',
          "rb'''", 'rb"""', "Rb'''", 'Rb"""',
          "rB'''", 'rB"""', "RB'''", 'RB"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "f'", 'f"', "F'", 'F"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"',
          "rb'", 'rb"', "Rb'", 'Rb"',
          "rB'", 'rB"', "RB'", 'RB"',):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
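
# A hedged usage sketch (the callable below is illustrative): any function
# with printtoken's five-argument signature can serve as the tokeneater.
#
#   import io
#   names = []
#   def collect_names(type, token, start, end, line):
#       if type == NAME:
#           names.append(token)
#   tokenize(io.StringIO("x = y\n").readline, collect_names)
#   # names == ['x', 'y']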

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
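
# For example: _get_normal_name("UTF_8") == "utf-8" and
# _get_normal_name("Latin-1") == "iso-8859-1"; unrecognized names pass through.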

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

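# A minimal usage sketch (illustrative; "example.py" is a placeholder path):
#
#   with open("example.py", "rb") as f:
#       encoding, consumed_lines = detect_encoding(f.readline)
#   # 'encoding' is e.g. 'utf-8'; 'consumed_lines' holds the raw bytes lines
#   # read while sniffing, so they can be replayed to a tokenizer.
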
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:               # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':      # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                      # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:     # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':          # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)